In [8]:
import pandas as pd
import pickle
import numpy as np
import shapefile
from scipy.spatial import cKDTree
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import TruncatedSVD

### DataFrame

In [108]:
df = pd.read_csv('./input/train.csv')

def cleanX(xdata):
    toDrop = ['Address','Block','Street', 'AddressNumberAndStreet']
    
    toRet = xdata.copy()
    toRet['Date'] = pd.to_datetime(toRet['Date'])
    
    toRet['Yr'] = [t.year for t in toRet['Date']]
    toRet['Mo'] = [t.month for t in toRet['Date']]
    #toRet['Day'] = [t.day for t in toRet['Date']]
    toRet['Week'] = [t.week for t in toRet['Date']]
    
    toRet['Location'] = [(toRet.loc[idx,'Longitude'], toRet.loc[idx,'Latitude'])
                  for idx in toRet.index]
    
    toRet = toRet.drop(toDrop, axis = 'columns')
    
    #toRet['Yr'] = toRet['Date'].dt.year
    
    return toRet

df = cleanX(df)

df.head(2)

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Yr,Mo,Week,Location
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,9,1,0,2007,5,22,"(-87.800991, 41.95469)"
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,9,1,0,2007,5,22,"(-87.800991, 41.95469)"


### Weather

In [109]:
weather = pd.read_csv('./input/weather.csv')

#weather info: years: 2007-2014 months: 5,6,7,8,9,10, 1 observation per day per station

#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level


#Convert datatypes
weather['Date'] = pd.to_datetime(weather['Date'])

weather['Yr'] = weather['Date'].dt.year
weather['Mo'] = weather['Date'].dt.month
weather['Day'] = weather['Date'].dt.day

#Drop columns that we are not interested in 
weather = weather.drop({'Depart', 'Depth', 'Water1', 'SnowFall', 'CodeSum', 'Heat', 'Cool'}, axis=1)

#Replace 'M' = missing value with NaN & '  T' with NaN
weather = weather.replace('M', np.nan)
#Replace'  T' with 0.001 
weather = weather.replace('  T', 0.001)
#Replace'  -' with 0000 
weather = weather.replace('-', '0000')

#Convert datatypes
weather['Tavg'] = weather['Tavg'].astype(float)
weather['WetBulb'] = weather['WetBulb'].astype(float)

weather['Sunrise'] = pd.to_datetime(weather['Sunrise'], format="%H%M")

#Fix some data ex: 16:60 --> 1700
weather['Sunset'] = [date if date[-2:] != '60' else str(int(date[0:2])+1)+'00' for date in weather['Sunset']]
weather['Sunset'] = pd.to_datetime(weather['Sunset'], format="%H%M")

weather['PrecipTotal'] = weather['PrecipTotal'].astype(float)
weather['StnPressure'] = weather['StnPressure'].astype(float)
weather['SeaLevel'] = weather['SeaLevel'].astype(float)
weather['ResultSpeed'] = weather['ResultSpeed'].astype(float)
weather['AvgSpeed'] = weather['AvgSpeed'].astype(float)

#Drop null values 
weather.dropna(inplace=True)

## At this point, Weather is in good shape
#Weather from Station 1 
weather_st1 = weather[weather['Station']== 1]

#Weather from Station 2
weather_st2 = weather[weather['Station']== 2]

In [110]:
def aggWeatherForObs(data, weather):
    agg = []

    def calculate_agregate (trap_df, weather_sub, end_date):

        toRet = pd.Series()

        toRet['Trap'] = trap_df.iloc[0,2] 
        toRet['Date_end'] = pd.to_datetime(end_date)
        #print(pd.to_datetime(end_date))
        for c in ['DewPoint', 'StnPressure', 'AvgSpeed']:
            for f in [np.max, np.min, np.mean]:
                toRet.loc[c+str(f).split(' ')[1]] = f(weather_sub[c])

        toRet['temp_max'] = weather_sub['Tmax'].max()
        toRet['temp_min'] = weather_sub['Tmin'].min()
        toRet['temp_avg'] = weather_sub['Tavg'].mean()

        toRet['precip_total'] = weather_sub['PrecipTotal'].sum()
        toRet['precip_avg'] = toRet['precip_total'] / len(weather_sub)
        #print(type(toRet['Date_end']))
        return toRet

    trapDFs =[]
    for trap in data['Trap'].unique():
        trapDFs.append(df[ df['Trap'] == trap ])

    for tDF in trapDFs:

        dates = tDF.sort_values('Date')['Date'].unique()
        dates = np.insert(dates, 0, dates[0] - pd.Timedelta(days=15))

        dateRanges=[]

        for i in range(len(dates)-1):
            dateRanges.append( (dates[i] , dates[i+1]) )

            start_date = dateRanges[i][0]
            end_date = dateRanges[i][1]

            #Select dataframe rows between two dates in weather
            mask = (weather['Date'] > start_date) & (weather['Date'] <= end_date)

            weather_sub = weather_st1.loc[mask]
            
            agg.append( calculate_agregate(tDF, weather_sub, end_date) )
    toRet = pd.DataFrame(agg)
    toRet['Date_end'] = [pd.to_datetime(d) for d in toRet['Date_end']]
    return toRet

In [111]:
weatherObs = aggWeatherForObs(df, weather_st1)
df_w = df.merge( weatherObs, left_on=['Date', 'Trap'],
              right_on=['Date_end', 'Trap'],
              how='outer')
df_weather.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Yr,Mo,...,Tmin_min,Tavg_avg,DewPoint_max,DewPoint_min,DewPoint_avg,StnPressure_max,StnPressure_min,StnPressure_avg,PrecipTotal_total,AvgSpeed_avg
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,9,1,0,2007,5,...,42,64.4667,58,30,46,29.55,29.13,29.384,1.662,10.0133
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,9,1,0,2007,5,...,42,64.4667,58,30,46,29.55,29.13,29.384,1.662,10.0133
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,9,1,0,2007,5,...,42,64.4667,58,30,46,29.55,29.13,29.384,1.662,10.0133
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,8,1,0,2007,5,...,42,64.4667,58,30,46,29.55,29.13,29.384,1.662,10.0133
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,8,4,0,2007,5,...,42,64.4667,58,30,46,29.55,29.13,29.384,1.662,10.0133


### Parks/Water

In [112]:
parkDir = './AddData/Parks/'
waterDir = './AddData/Water/'
uniqueLocs = df['Location'].unique()

def buildWaterAndParkDicts(parkDir, waterDir):

    # Park and water shapefile names
    parks = [f for f in listdir(parkDir) if isfile(join(parkDir,f)) if f.count('.csv') ==0]
    water = [f for f in listdir(waterDir) if isfile(join(waterDir,f))if f.count('.csv') ==0]

    parkShape = parkDir + parks[0].split('.')[0]
    waterShape = waterDir + water[0].split('.')[0]

    # Read in shapefiles, then the shapes/records
    psf = shapefile.Reader(parkShape)
    wsf = shapefile.Reader(waterShape)

    parkSR = psf.shapeRecords()
    waterSR = wsf.shapeRecords()

    # Create a parksize dictionary, keyed on parkname
    parkSize = {}
    for s in parkSR:
        parkSize[s.record[4]] = s.record[19]

    # Create cKDTree functions in Dict

    # Key: Identifier (number for water, park name for park)
    # Value: cKDTree function built on all the points associated with water/park feature

    waterFinder={}
    for i, s in enumerate(waterSR):
        waterFinder[i] = cKDTree(s.shape.points)

    parkFinder = {}
    for s in parkSR:
        parkFinder[s.record[4]] = cKDTree(s.shape.points)
    
    return parkSize, parkFinder, waterFinder

def yeildParkSVD(parkSize, parkFinder, uniqueLocs, TruncSVD = 'calc', comps = 4):
    
    parkDist = {}
    
    for l in uniqueLocs:
        parkDist[l] = {}
        for k in parkFinder:
            dist = parkFinder[k].query(l,1)[0]
            size = parkSize[k]
            parkDist[l][k] = (dist, size, size/(dist**2))
    
    parkDF = pd.DataFrame()
    parkDF = parkDF.from_dict(parkDist)
    parkDF = parkDF.transpose()
    parkDF.index = [idx for idx in parkDF.index]
    
    for c in parkDF:
        parkDF[c+' Area'] = [e[1] for e in parkDF[c]]
        parkDF[c+ ' Effect'] = [e[2] for e in parkDF[c]]
        parkDF[c] = [e[0] for e in parkDF[c]]
    
    if TruncSVD == 'calc':
        TruncSVD = TruncatedSVD(n_components = comps)
        TruncSVD.fit(parkDF)
    
    toRet = TruncSVD.transform(parkDF)
    
    toRet = pd.DataFrame(toRet, index = parkDF.index)
    return toRet, TruncSVD

def yeildWaterSVD(waterFinder, uniqueLocs, TruncSVD = 'calc', comps = 4):
    
    waterDist = {}
    
    for l in uniqueLocs:
        waterDist[l] = {}
        for k in waterFinder:
            waterDist[l][k] = waterFinder[k].query(l,1)[0]
    
    waterDF = pd.DataFrame()
    waterDF = waterDF.from_dict(waterDist)
    waterDF = waterDF.transpose()
    waterDF.index = [idx for idx in waterDF.index]
     
    if TruncSVD == 'calc':
        TruncSVD = TruncatedSVD(n_components = comps)
        TruncSVD.fit(waterDF)
    
    toRet = TruncSVD.transform(waterDF)
    
    toRet = pd.DataFrame(toRet, index = waterDF.index)
    return toRet, TruncSVD    

ps, pf, wf = buildWaterAndParkDicts(parkDir, waterDir)
pdf, ptsvd = yeildParkSVD(ps, pf, uniqueLocs, comps = 6)
wdf, wtsvd = yeildWaterSVD(wf, uniqueLocs, comps = 6)
pdf.columns = ['Park'+str(c) for c in pdf.columns]
wdf.columns = ['Water'+str(c) for c in wdf.columns]

In [113]:
df_a = df_w.merge(pdf, how = 'outer', left_on='Location', right_index=True)
df_a = df_a.merge(wdf, how = 'outer', left_on = 'Location', right_index = True)

In [114]:
df_a.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Yr,Mo,...,Park2,Park3,Park4,Park5,Water0,Water1,Water2,Water3,Water4,Water5
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,9,1,0,2007,5,...,7622.901305,19319.121116,6090.6478,57976.436337,5.445062,-2.208052,0.66556,-0.069287,0.060115,-0.061254
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,9,1,0,2007,5,...,7622.901305,19319.121116,6090.6478,57976.436337,5.445062,-2.208052,0.66556,-0.069287,0.060115,-0.061254
25,2007-06-05,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,9,3,0,2007,6,...,7622.901305,19319.121116,6090.6478,57976.436337,5.445062,-2.208052,0.66556,-0.069287,0.060115,-0.061254
26,2007-06-05,CULEX RESTUANS,T002,41.95469,-87.800991,9,5,0,2007,6,...,7622.901305,19319.121116,6090.6478,57976.436337,5.445062,-2.208052,0.66556,-0.069287,0.060115,-0.061254
27,2007-06-05,CULEX PIPIENS,T002,41.95469,-87.800991,9,1,0,2007,6,...,7622.901305,19319.121116,6090.6478,57976.436337,5.445062,-2.208052,0.66556,-0.069287,0.060115,-0.061254


### Spray


In [115]:
spray = pd.read_csv('./input/spray.csv')
spray['DT'] = [spray.loc[idx, 'Date']+' '+ spray.loc[idx,'Time']
               if spray.loc[idx,'Time']== str else spray.loc[idx, 'Date']
               for idx in spray.index]

spray['Date'] = pd.to_datetime(spray['DT'])

spray['Yr'] = [t.year for t in spray['Date']]
spray['Mo'] = [t.month for t in spray['Date']]
spray['Day'] = [t.day for t in spray['Date']]
spray['Hour'] =[t.hour + (t.minute/60) for t in spray['Date']]

spray['Location'] = [(spray.loc[idx,'Longitude'], spray.loc[idx,'Latitude'])
                  for idx in spray.index]

spray = spray.drop(['Time', 'DT'], axis = 'columns')

spray.head(2)

Unnamed: 0,Date,Latitude,Longitude,Yr,Mo,Day,Hour,Location
0,2011-08-29,42.391623,-88.089163,2011,8,29,0.0,"(-88.0891633333, 42.3916233333)"
1,2011-08-29,42.391348,-88.089163,2011,8,29,0.0,"(-88.0891633333, 42.3913483333)"


### Spray Categorical

In [116]:
spray_dict = {}

for i in spray['Date'].dt.date.unique():
    spray_dict[i] = cKDTree( list(spray[spray['Date'] >= i ].loc[:,'Location']) )

In [117]:
spray_dict = {}

for i in spray['Date'].dt.date.unique():
    spray_dict[i] = cKDTree( list(spray[spray['Date'] >= i ].loc[:,'Location']) )

def nearSpray(data, spray_dict, dist = .1):
    toRet = {}
    
    spray_dates = list(spray_dict.keys())
    spray_dates.sort(reverse = True)
    firstSpray = spray_dates[-1]
    
    for l in data['Location'].unique():
        toRet[l] = {}
        trapDates = data[ data['Location'] ==l]['Date'].unique()
        trapDates.sort()
        trapDates = [pd.Timestamp(d).date() for d in trapDates]
        if trapDates[-1] > firstSpray:
            for tDate in trapDates:
                for sDate in spray_dates:
                    before, after = False, False
                    
                    if ((tDate - sDate < pd.Timedelta(0)) &
                    (tDate-sDate > pd.Timedelta(days=-8)) & 
                   (spray_dict[sDate].query(l,1)[0]*69 < dist)):
                        before = True
                    
                    if ((tDate - sDate > pd.Timedelta(0)) &
                    (tDate-sDate < pd.Timedelta(days=8)) & 
                   (spray_dict[sDate].query(l,1)[0]*69 < dist)):
                        #print('hi')
                        after = True
                    
                toRet[l][tDate] = (before, after)
                
                toRet2 = {}
                for k1 in toRet:
                    for k2 in toRet[k1]:
                        toRet2[(k1,k2)] = toRet[k1][k2]
                
    df = pd.DataFrame().from_dict(toRet2).transpose()
    df.columns = ['SprayBefore', 'SprayAfter']
    df['Loc'] = df.index.map(lambda x: x[0])
    df['Date'] = df.index.map(lambda x: x[1])
    df.index = range(0,len(df))
                
    return df
nearspray = nearSpray(df, spray_dict)
nearspray.head()

Unnamed: 0,SprayBefore,SprayAfter,Loc,Date
0,False,False,"(-87.930995, 41.957799)",2009-06-03
1,False,False,"(-87.930995, 41.957799)",2009-06-05
2,False,False,"(-87.930995, 41.957799)",2009-06-15
3,False,False,"(-87.930995, 41.957799)",2009-06-22
4,False,False,"(-87.930995, 41.957799)",2009-06-29


In [118]:
mask = (df_w['Yr'] == 2011) | (df_w['Yr']==2013)
df_s = df_w.loc[mask]
df_s = df_s.merge(nearspray)

In [119]:
df_s.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Yr,Mo,...,AvgSpeedamin,AvgSpeedmean,temp_max,temp_min,temp_avg,precip_total,precip_avg,SprayBefore,SprayAfter,Loc
0,2011-06-10,CULEX TERRITANS,T049,41.896282,-87.655232,8,1,0,2011,6,...,3.3,8.95977,96,29,64.762452,40.684,0.155877,False,False,"(-87.930995, 41.957799)"
1,2011-06-10,CULEX TERRITANS,T049,41.896282,-87.655232,8,1,0,2011,6,...,3.3,8.95977,96,29,64.762452,40.684,0.155877,False,False,"(-87.890615, 41.974689)"
2,2011-06-10,CULEX TERRITANS,T049,41.896282,-87.655232,8,1,0,2011,6,...,3.3,8.95977,96,29,64.762452,40.684,0.155877,False,False,"(-87.805059, 41.973845000000004)"
3,2011-06-10,CULEX TERRITANS,T049,41.896282,-87.655232,8,1,0,2011,6,...,3.3,8.95977,96,29,64.762452,40.684,0.155877,False,False,"(-87.800515, 41.916265)"
4,2011-06-10,CULEX TERRITANS,T049,41.896282,-87.655232,8,1,0,2011,6,...,3.3,8.95977,96,29,64.762452,40.684,0.155877,False,False,"(-87.785288, 41.923738)"


In [121]:
allDF = [df_w, df_a, df_s]
with open('allDF.pickle', 'wb') as f:
    pickle.dump(allDF, f, protocol = 0)
    
