In [12]:
import pandas as pd
import pickle
import numpy as np
import shapefile
from scipy.spatial import cKDTree
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import TruncatedSVD

### DataFrame

In [13]:
df = pd.read_csv('./input/train.csv')

noAgg = [c for c in df.columns if c not in ['NumMosquitos','WnvPresent']]

agg = df.groupby(noAgg)['NumMosquitos', 'WnvPresent'].sum()

for i, c in enumerate(noAgg):
    agg[c] = agg.index.map(lambda x:x[i])

agg.index = range(0,len(agg))

agg['WnvPresent'] = (agg['WnvPresent'].map(lambda x:x>0)).astype(int)

df = agg

def cleanX(xdata):
    species = ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS',
                'CULEX RESTUANS', 'CULEX SALINARIUS',
                'CULEX TERRITANS', 'CULEX TARSALIS',
                 'CULEX ERRATICUS']
    
    toDrop = [
        'Address','Block','Street',
              'AddressNumberAndStreet', 'AddressAccuracy',
             ]
    
    toRet = xdata.copy()
    toRet['Date'] = pd.to_datetime(toRet['Date'])
    
    toRet['Yr'] = [t.year for t in toRet['Date']]
    toRet['Mo'] = [t.month for t in toRet['Date']]
    #toRet['Day'] = [t.day for t in toRet['Date']]
    toRet['Week'] = [t.week for t in toRet['Date']]
    
    toRet['Location'] = [(toRet.loc[idx,'Longitude'], toRet.loc[idx,'Latitude'])
                  for idx in toRet.index]
    for s in species:
        toRet[s] = (toRet['Species'] == s).astype(int)
        
    toRet = toRet.drop(toDrop, axis = 'columns')
    
    return toRet

df = cleanX(df)

df.head(2)
df.shape

(8475, 18)

### Weather

In [14]:
weather = pd.read_csv('./input/weather.csv')

#weather info: years: 2007-2014 months: 5,6,7,8,9,10, 1 observation per day per station

#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level


#Convert datatypes
weather['Date'] = pd.to_datetime(weather['Date'])

weather['Yr'] = weather['Date'].dt.year
weather['Mo'] = weather['Date'].dt.month
weather['Day'] = weather['Date'].dt.day

#Drop columns that we are not interested in 
weather = weather.drop({'Depart', 'Depth', 'Water1', 'SnowFall', 'CodeSum', 'Heat', 'Cool'}, axis=1)

#Replace 'M' = missing value with NaN & '  T' with NaN
weather = weather.replace('M', np.nan)
#Replace'  T' with 0.001 
weather = weather.replace('  T', 0.001)
#Replace'  -' with 0000 
weather = weather.replace('-', '0000')

#Convert datatypes
weather['Tavg'] = weather['Tavg'].astype(float)
weather['WetBulb'] = weather['WetBulb'].astype(float)

weather['Sunrise'] = pd.to_datetime(weather['Sunrise'], format="%H%M")

#Fix some data ex: 16:60 --> 1700
weather['Sunset'] = [date if date[-2:] != '60' else str(int(date[0:2])+1)+'00' for date in weather['Sunset']]
weather['Sunset'] = pd.to_datetime(weather['Sunset'], format="%H%M")

weather['PrecipTotal'] = weather['PrecipTotal'].astype(float)
weather['StnPressure'] = weather['StnPressure'].astype(float)
weather['SeaLevel'] = weather['SeaLevel'].astype(float)
weather['ResultSpeed'] = weather['ResultSpeed'].astype(float)
weather['AvgSpeed'] = weather['AvgSpeed'].astype(float)

#Drop null values 
weather.dropna(inplace=True)

## At this point, Weather is in good shape
#Weather from Station 1 
weather_st1 = weather[weather['Station']== 1]

#Weather from Station 2
weather_st2 = weather[weather['Station']== 2]

In [37]:
df.head(2)

Unnamed: 0,NumMosquitos,WnvPresent,Date,Species,Trap,Latitude,Longitude,Yr,Mo,Week,Location,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TERRITANS,CULEX TARSALIS,CULEX ERRATICUS
0,1,0,2007-05-29,CULEX PIPIENS/RESTUANS,T048,41.867108,-87.654224,2007,5,22,"(-87.654224, 41.867108)",0,1,0,0,0,0,0
1,2,0,2007-05-29,CULEX RESTUANS,T048,41.867108,-87.654224,2007,5,22,"(-87.654224, 41.867108)",0,0,1,0,0,0,0


In [53]:
from sklearn.linear_model import LinearRegression

def aggWeatherForObs(data, weather):
    agg = []

    weather['Week'] = weather['Date'].dt.week
    wtw = weather.groupby('Week')['Tavg'].mean()
    df = pd.DataFrame(wtw)
    
    df['Week'] = df.index -17
    df['Week^2'] = df['Week']**2
    
    lr = LinearRegression()
    lr.fit(df.drop('Tavg', axis = 'columns'), df['Tavg'])
    
    def weeklyAvgTemp(weeknum, lm):
        wk = weeknum - 17
        
        return lm.intercept_ + (lr.coef_[0]*wk) + (lr.coef_[1]* (wk**2))
    
    def calculate_agregate (trap_df, weather_sub, end_date):

        toRet = pd.Series()

        toRet['Trap'] = trap_df.iloc[0,4] 
        toRet['Date_end'] = pd.to_datetime(end_date)
        
        for c in ['DewPoint', 'StnPressure', 'AvgSpeed']:
            for f in [np.max, np.min, np.mean]:
                toRet.loc[c+str(f).split(' ')[1]] = f(weather_sub[c])

        toRet['temp_max'] = weather_sub['Tmax'].max()
        toRet['temp_min'] = weather_sub['Tmin'].min()
        toRet['temp_avg'] = weather_sub['Tavg'].mean()
        toRet['temp_expected'] = weeklyAvgTemp(pd.to_datetime(toRet['Date_end']).week , lr)
        toRet['temp_diff'] = toRet['temp_avg'] - toRet['temp_expected']
        
        #print(weather_sub['Date'])
        
        try:
            sunset = weather_sub.iloc[len(weather_sub)-1]['Sunset']
        except:
            print('no')
            
        toRet['sunset'] = sunset.hour + (sunset.minute / 60)
        
        toRet['precip_total'] = weather_sub['PrecipTotal'].sum()
        toRet['precip_avg'] = toRet['precip_total'] / len(weather_sub)
        
        return toRet
    
    trapDFs = []
    for trap in data['Trap'].unique():
        trapDFs.append(data[ data['Trap'] == trap ])

    for tDF in trapDFs:

        dates = tDF['Date'].unique()
        dates = sorted(dates)
        
        fyear= {}
        
        for y in [2007, 2009, 2011, 2013]: # needs to be generalized
            for i, d in enumerate(dates):
                if pd.to_datetime(d).year == y:
                    fyear[i] = d
                    break
                           
        for idx in fyear:
            dates = np.insert(dates, 0, fyear[idx] - pd.Timedelta(days = 8))
        dates = sorted(dates)
        
        dateRanges = []
        for i in range(len(dates)-1):
            if pd.to_datetime(dates[i]).year == pd.to_datetime(dates[i+1]).year:
                dateRanges.append( (dates[i] , dates[i+1]) )

        for dr in dateRanges:
            start_date = dr[0]
            end_date = dr[1]

            #Select dataframe rows between two dates in weather
            mask = (weather['Date'] > start_date) & (weather['Date'] <= end_date)
            
            weather_sub = weather.loc[mask]
            
            agg.append( calculate_agregate(tDF, weather_sub, end_date) )
            
    toRet = pd.DataFrame(agg)
    toRet['Date'] = [pd.to_datetime(d) for d in toRet['Date_end']]
    toRet = toRet.drop('Date_end', axis = 'columns')
    return toRet

In [36]:
weatherObs.head()

Unnamed: 0,Trap,DewPointamax,DewPointamin,DewPointmean,StnPressureamax,StnPressureamin,StnPressuremean,AvgSpeedamax,AvgSpeedamin,AvgSpeedmean,temp_max,temp_min,temp_avg,temp_expected,temp_diff,sunset,precip_total,precip_avg,Date
0,CULEX PIPIENS/RESTUANS,58,44,50.0,29.51,29.34,29.39625,17.3,5.3,9.575,89,49,68.375,67.484959,0.890041,19.283333,1.072,0.134,2007-05-29
1,CULEX PIPIENS/RESTUANS,63,48,58.714286,29.31,28.82,29.078571,10.6,6.4,8.328571,87,47,69.571429,69.319181,0.252248,19.383333,1.06,0.151429,2007-06-05
2,CULEX PIPIENS/RESTUANS,69,47,56.142857,29.44,28.84,29.275714,23.1,4.9,9.171429,93,44,72.952381,73.228946,-0.276565,19.516667,1.092,0.052,2007-06-26
3,CULEX PIPIENS/RESTUANS,70,46,54.666667,29.51,29.29,29.42,12.5,5.8,8.683333,91,53,68.0,74.001234,-6.001234,19.516667,0.15,0.025,2007-07-02
4,CULEX PIPIENS/RESTUANS,67,51,61.444444,29.32,29.05,29.186667,13.1,5.5,9.0,94,60,77.555556,74.508038,3.047517,19.466667,1.29,0.143333,2007-07-11


In [39]:
%%time
weatherObs = aggWeatherForObs(df, weather_st1)
df_w = df.merge( weatherObs, left_on=['Date', 'Trap'],
              right_on=['Date', 'Trap'],
              how='outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Wall time: 1min 9s


### Parks/Water

In [56]:
df_w.head(2)

Unnamed: 0,NumMosquitos,WnvPresent,Date,Species,Trap,Latitude,Longitude,Yr,Mo,Week,...,AvgSpeedamin,AvgSpeedmean,temp_max,temp_min,temp_avg,temp_expected,temp_diff,sunset,precip_total,precip_avg
0,1,0,2007-05-29,CULEX PIPIENS/RESTUANS,T048,41.867108,-87.654224,2007,5,22,...,5.3,9.575,89,49,68.375,67.484959,0.890041,19.283333,1.072,0.134
1,2,0,2007-05-29,CULEX RESTUANS,T048,41.867108,-87.654224,2007,5,22,...,5.3,9.575,89,49,68.375,67.484959,0.890041,19.283333,1.072,0.134


In [41]:
parkDir = './AddData/Parks/'
waterDir = './AddData/Water/'

uniqueLocs = df['Location'].unique()


def buildWaterAndParkDicts(parkDir, waterDir):

    # Park and water shapefile names
    parks = [f for f in listdir(parkDir) if isfile(join(parkDir,f)) if f.count('.csv') ==0]
    water = [f for f in listdir(waterDir) if isfile(join(waterDir,f))if f.count('.csv') ==0]

    parkShape = parkDir + parks[0].split('.')[0]
    waterShape = waterDir + water[0].split('.')[0]

    # Read in shapefiles, then the shapes/records
    psf = shapefile.Reader(parkShape)
    wsf = shapefile.Reader(waterShape)

    parkSR = psf.shapeRecords()
    waterSR = wsf.shapeRecords()

    # Create a parksize dictionary, keyed on parkname
    parkSize = {}
    for s in parkSR:
        parkSize[s.record[4]] = s.record[19]

    # Create cKDTree functions in Dict

    # Key: Identifier (number for water, park name for park)
    # Value: cKDTree function built on all the points associated with water/park feature

    waterFinder={}
    for i, s in enumerate(waterSR):
        waterFinder[i] = cKDTree(s.shape.points)

    parkFinder = {}
    for s in parkSR:
        parkFinder[s.record[4]] = cKDTree(s.shape.points)
    
    return parkSize, parkFinder, waterFinder

def yeildParkSVD(parkSize, parkFinder, uniqueLocs, TruncSVD = 'calc', comps = 4):
    
    parkDist = {}
    
    for l in uniqueLocs:
        parkDist[l] = {}
        for k in parkFinder:
            dist = parkFinder[k].query(l,1)[0]
            size = parkSize[k]
            parkDist[l][k] = (dist, size, size/(dist**2))
    
    parkDF = pd.DataFrame()
    parkDF = parkDF.from_dict(parkDist)
    parkDF = parkDF.transpose()
    parkDF.index = [idx for idx in parkDF.index]
    
    for c in parkDF:
        parkDF[c+' Area'] = [e[1] for e in parkDF[c]]
        parkDF[c+ ' Effect'] = [e[2] for e in parkDF[c]]
        parkDF[c] = [e[0] for e in parkDF[c]]
    
    if TruncSVD == 'calc':
        TruncSVD = TruncatedSVD(n_components = comps)
        TruncSVD.fit(parkDF)
    
    toRet = TruncSVD.transform(parkDF)
    
    toRet = pd.DataFrame(toRet, index = parkDF.index)
    return toRet, TruncSVD

def yeildWaterSVD(waterFinder, uniqueLocs, TruncSVD = 'calc', comps = 4):
    
    waterDist = {}
    
    for l in uniqueLocs:
        waterDist[l] = {}
        for k in waterFinder:
            waterDist[l][k] = waterFinder[k].query(l,1)[0]
    
    waterDF = pd.DataFrame()
    waterDF = waterDF.from_dict(waterDist)
    waterDF = waterDF.transpose()
    waterDF.index = [idx for idx in waterDF.index]
     
    if TruncSVD == 'calc':
        TruncSVD = TruncatedSVD(n_components = comps)
        TruncSVD.fit(waterDF)
    
    toRet = TruncSVD.transform(waterDF)
    
    toRet = pd.DataFrame(toRet, index = waterDF.index)
    return toRet, TruncSVD    

ps, pf, wf = buildWaterAndParkDicts(parkDir, waterDir)
pdf, ptsvd = yeildParkSVD(ps, pf, uniqueLocs, comps = 6)
wdf, wtsvd = yeildWaterSVD(wf, uniqueLocs, comps = 6)
pdf.columns = ['Park'+str(c) for c in pdf.columns]
wdf.columns = ['Water'+str(c) for c in wdf.columns]

In [135]:
wdf.head()

Unnamed: 0,Water0,Water1,Water2,Water3,Water4,Water5
"(-87.930995, 41.957799)",7.452501,-2.415331,2.122308,-0.353222,0.585042,0.305456
"(-87.890615, 41.974689)",6.846056,-2.593422,1.839218,-0.280489,0.525015,0.210701
"(-87.862995, 41.992478000000006)",6.625241,-2.65285,1.583163,-0.151807,0.425006,0.121209
"(-87.832763, 41.944869)",5.744749,-2.252962,1.059791,-0.23373,0.214307,0.021081
"(-87.824812, 41.974089)",5.919241,-2.453215,1.058257,-0.087411,0.206612,-0.006295


In [42]:
df_a = df_w.merge(pdf, how = 'outer', left_on='Location', right_index=True)
df_a = df_a.merge(wdf, how = 'outer', left_on = 'Location', right_index = True)

In [55]:
df_a.head(2)

Unnamed: 0,NumMosquitos,WnvPresent,Date,Species,Trap,Latitude,Longitude,Yr,Mo,Week,...,Park2,Park3,Park4,Park5,Water0,Water1,Water2,Water3,Water4,Water5
0,1,0,2007-05-29,CULEX PIPIENS/RESTUANS,T048,41.867108,-87.654224,2007,5,22,...,29715.14289,67489.311127,61468.115233,3092.465154,3.82796,0.108809,-1.189728,-0.10479,0.184891,0.114866
1,2,0,2007-05-29,CULEX RESTUANS,T048,41.867108,-87.654224,2007,5,22,...,29715.14289,67489.311127,61468.115233,3092.465154,3.82796,0.108809,-1.189728,-0.10479,0.184891,0.114866


In [44]:
df_a.columns

Index(['NumMosquitos', 'WnvPresent', 'Date', 'Species', 'Trap', 'Latitude',
       'Longitude', 'Yr', 'Mo', 'Week', 'Location', 'CULEX PIPIENS',
       'CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX SALINARIUS',
       'CULEX TERRITANS', 'CULEX TARSALIS', 'CULEX ERRATICUS', 'DewPointamax',
       'DewPointamin', 'DewPointmean', 'StnPressureamax', 'StnPressureamin',
       'StnPressuremean', 'AvgSpeedamax', 'AvgSpeedamin', 'AvgSpeedmean',
       'temp_max', 'temp_min', 'temp_avg', 'temp_expected', 'temp_diff',
       'sunset', 'precip_total', 'precip_avg', 'Park0', 'Park1', 'Park2',
       'Park3', 'Park4', 'Park5', 'Water0', 'Water1', 'Water2', 'Water3',
       'Water4', 'Water5'],
      dtype='object')

### Spray


In [45]:
spray = pd.read_csv('./input/spray.csv')
spray['DT'] = [spray.loc[idx, 'Date']+' '+ spray.loc[idx,'Time']
               if spray.loc[idx,'Time']== str else spray.loc[idx, 'Date']
               for idx in spray.index]

spray['Date'] = pd.to_datetime(spray['DT'])

spray['Yr'] = [t.year for t in spray['Date']]
spray['Mo'] = [t.month for t in spray['Date']]
spray['Day'] = [t.day for t in spray['Date']]
spray['Hour'] =[t.hour + (t.minute/60) for t in spray['Date']]

spray['Location'] = [(spray.loc[idx,'Longitude'], spray.loc[idx,'Latitude'])
                  for idx in spray.index]

spray = spray.drop(['Time', 'DT'], axis = 'columns')

spray.head(2)

Unnamed: 0,Date,Latitude,Longitude,Yr,Mo,Day,Hour,Location
0,2011-08-29,42.391623,-88.089163,2011,8,29,0.0,"(-88.0891633333, 42.3916233333)"
1,2011-08-29,42.391348,-88.089163,2011,8,29,0.0,"(-88.0891633333, 42.3913483333)"


### Spray Categorical

In [46]:
spray_dict = {}

for i in spray['Date'].dt.date.unique():
    spray_dict[i] = cKDTree( list(spray[spray['Date'] >= i ].loc[:,'Location']) )

In [47]:
spray_dict = {}

for i in spray['Date'].dt.date.unique():
    spray_dict[i] = cKDTree( list(spray[spray['Date'] >= i ].loc[:,'Location']) )

def nearSpray(data, spray_dict, dist = .1):
    toRet = {}
    
    spray_dates = list(spray_dict.keys())
    spray_dates.sort(reverse = True)
    firstSpray = spray_dates[-1]
    
    for l in data['Location'].unique():
        toRet[l] = {}
        trapDates = data[ data['Location'] ==l]['Date'].unique()
        trapDates.sort()
        trapDates = [pd.Timestamp(d).date() for d in trapDates]
        if trapDates[-1] > firstSpray:
            for tDate in trapDates:
                for sDate in spray_dates:
                    before, after = False, False
                    
                    if ((tDate - sDate < pd.Timedelta(0)) &
                    (tDate-sDate > pd.Timedelta(days=-8)) & 
                   (spray_dict[sDate].query(l,1)[0]*69 < dist)):
                        before = True
                    
                    if ((tDate - sDate > pd.Timedelta(0)) &
                    (tDate-sDate < pd.Timedelta(days=8)) & 
                   (spray_dict[sDate].query(l,1)[0]*69 < dist)):
                        #print('hi')
                        after = True
                    
                toRet[l][tDate] = (before, after)
                
                toRet2 = {}
                for k1 in toRet:
                    for k2 in toRet[k1]:
                        toRet2[(k1,k2)] = toRet[k1][k2]
                
    df = pd.DataFrame().from_dict(toRet2).transpose()
    df.columns = ['SprayBefore', 'SprayAfter']
    df['Loc'] = df.index.map(lambda x: x[0])
    df['Date'] = df.index.map(lambda x: x[1])
    df.index = range(0,len(df))
                
    return df
nearspray = nearSpray(df, spray_dict)
nearspray.head()

Unnamed: 0,SprayBefore,SprayAfter,Loc,Date
0,False,False,"(-87.930995, 41.957799)",2009-06-03
1,False,False,"(-87.930995, 41.957799)",2009-06-05
2,False,False,"(-87.930995, 41.957799)",2009-06-15
3,False,False,"(-87.930995, 41.957799)",2009-06-22
4,False,False,"(-87.930995, 41.957799)",2009-06-29


In [48]:
mask = (df_w['Date'].dt.year == 2011) | (df_w['Date'].dt.year ==2013)
df_s = df_w.loc[mask]
df_s = df_s.merge(nearspray)

In [49]:
df_s.head()

Unnamed: 0,NumMosquitos,WnvPresent,Date,Species,Trap,Latitude,Longitude,Yr,Mo,Week,...,temp_min,temp_avg,temp_expected,temp_diff,sunset,precip_total,precip_avg,SprayBefore,SprayAfter,Loc
0,4,0,2011-06-10,CULEX RESTUANS,T073,41.773215,-87.60088,2011,6,23,...,50,73.25,69.319181,3.930819,19.433333,1.811,0.226375,False,False,"(-87.930995, 41.957799)"
1,4,0,2011-06-10,CULEX RESTUANS,T073,41.773215,-87.60088,2011,6,23,...,50,73.25,69.319181,3.930819,19.433333,1.811,0.226375,False,False,"(-87.890615, 41.974689)"
2,4,0,2011-06-10,CULEX RESTUANS,T073,41.773215,-87.60088,2011,6,23,...,50,73.25,69.319181,3.930819,19.433333,1.811,0.226375,False,False,"(-87.805059, 41.973845000000004)"
3,4,0,2011-06-10,CULEX RESTUANS,T073,41.773215,-87.60088,2011,6,23,...,50,73.25,69.319181,3.930819,19.433333,1.811,0.226375,False,False,"(-87.800515, 41.916265)"
4,4,0,2011-06-10,CULEX RESTUANS,T073,41.773215,-87.60088,2011,6,23,...,50,73.25,69.319181,3.930819,19.433333,1.811,0.226375,False,False,"(-87.785288, 41.923738)"


In [50]:
df_a.shape

(8475, 47)

In [54]:
allDF = [df_w, df_a, df_s]
with open('allDF.pickle', 'wb') as f:
    pickle.dump(allDF, f, protocol = 0)