In [1]:
import shapefile
from scipy.spatial import cKDTree
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression

#class ChicagoPreprocessor(object):#
#    def __init__(train, test = None):
#        self.parkFinder =


def agg_on_species(train, test):
    
    noAgg = [c for c in train.columns if c not in ['NumMosquitos','WnvPresent']]

    agg = train.groupby(noAgg)['NumMosquitos', 'WnvPresent'].sum()

    for i, c in enumerate(noAgg):
        agg[c] = agg.index.map(lambda x:x[i])

    agg.index = range(0,len(agg))
    agg['WnvPresent'] = (agg['WnvPresent'].map(lambda x:x>0)).astype(int)
    return agg, test

def InitPrepross(train, test):

    def location_add(df):
        df['Location'] = [(df.loc[idx,'Longitude'], df.loc[idx, 'Latitude'])
                            for idx in df.index]
        return df

    def change_date(df):
        df['Date'] = pd.to_datetime(df['Date'])

        return df

    def drop_unused(df):
        for col in ['Address','Block','Street',
              'AddressNumberAndStreet', 'AddressAccuracy',
                    ]:
            try:
                df = df.drop(col, axis = 'columns')
            except:
                print(col, 'not present')

        return df

    def species_dummies(df):
        species = ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS',
                'CULEX RESTUANS', 'CULEX SALINARIUS',
                'CULEX TERRITANS', 'CULEX TARSALIS',
                 'CULEX ERRATICUS']
        for s in species:
            df[s] = (df['Species'] == s).astype(int)

        return df

    def transform(df):
        df = drop_unused(df)
        df = location_add(df)
        df = change_date(df)
        df = species_dummies(df)
        return df
        
    return transform(train), transform(test)

In [31]:
#def WeatherProcess(train, test):

def yeildWeather(target):
    weather = pd.read_csv(target)
    weather['Date'] = pd.to_datetime(weather['Date'])

    toDrop = ['Depart', 'Depth','Water1',
            'SnowFall', 'CodeSum', 'Heat',
            'Cool', 'Sunrise']
    weather = weather.drop(toDrop, axis=1)

    toReplace = {'M':np.nan, '  T': 0.001, '-': '0000'}
    for k in toReplace:
        weather = weather.replace(k, toReplace[k])


    toFloats = ['Tavg', 'WetBulb', 'PrecipTotal','StnPressure',
                'SeaLevel', 'ResultSpeed','AvgSpeed']
    for c in toFloats:
        weather[c] = weather[c].astype(float)

    weather['Sunset'] = [date
                        if date[-2:] != '60'
                        else str(int(date[0:2])+1)+'00'
                        for date in weather['Sunset']]

    weather['Sunset'] = pd.to_datetime(weather['Sunset'],
                                        format="%H%M")
    weather.dropna(inplace=True)

    return weather[weather['Station']== 1]

def yeildAvgTemp(weather):
    weather['Wk'] = weather['Date'].dt.week
    weekTemp = pd.DataFrame(
                    weather.groupby('Wk')['Tavg'].mean())
    weekTemp['Week'] = weekTemp.index - 17
    weekTemp['Week^2'] = weekTemp['Week']**2

    lr = LinearRegression().fit(weekTemp.drop('Tavg', axis = 'columns'),
                                weekTemp['Tavg'])
    toRet = {}
    for w in range(1,53):
        toRet[w] = lr.intercept_ + (lr.coef_[0]*(w-17)) + (lr.coef_[1] * ((w-17)**2))

    return toRet

def calculate_agregate( weather_sub, avgTDict):
    toRet = pd.Series()

    allAgg = [np.max, np.min, np.mean]
    toAgg = {'DewPoint': allAgg,
            'StnPressure': allAgg,
            'AvgSpeed': allAgg,
            'Tmax':[np.max],
            'Tmin':[np.min],
            'Tavg':[np.mean],
            'PrecipTotal':[np.sum, np.mean]
            }
    for k in toAgg:
        for f in toAgg[k]:
            toRet.loc[k + str(f).split(' ')[1]] = f(weather_sub[k])

    finalEntry = weather_sub.iloc[len(weather_sub)-1]

    toRet['temp_expected'] = avgTDict[pd.to_datetime(finalEntry['Date']).week]
    toRet['temp_diff'] = toRet['Tavgmean'] - toRet['temp_expected']

    sunset = finalEntry['Sunset']
    toRet['sunset'] = sunset.hour + (sunset.minute / 60)

    return toRet

def date_ranges(dates):
    uniqueYears = set([pd.to_datetime(d).year for d in dates])

    dates = sorted(dates)
    fyear = []
    for y in uniqueYears:
        for d in dates:
            if pd.to_datetime(d).year == y:
                print(d)
                fyear.append(d)
                break

    for d in fyear:
        dates = np.insert(dates, 0, d - pd.Timedelta(days = 8))
        
    dateRanges = []
    for i in range(len(dates)-1):
        if pd.to_datetime(dates[i]).year == pd.to_datetime(dates[i+1]).year:
            dateRanges.append( (dates[i], dates[i+1]) )

    return dateRanges

def subset_weather(dateRange, weather):
    mask = (weather['Date']>dateRange[0]) & (weather['Date'] <= dateRange[1])
    return weather.loc[mask]

def TWeatherDFMaker(dct):
    toRet = pd.DataFrame().from_dict(dct)
    toRet = toRet.transpose()
    toRet.index = [idx for idx in toRet.index]
    toRet['Trap'] = toRet.index.map(lambda x: x[0])
    toRet['Date'] = toRet.index.map(lambda x: x[1])
    toRet.index = range(len(toRet))

    return toRet

def trap_agregator(trap_df, weather, avgTDict):
    trapWeather = {}
    trap = trap_df['Trap'].iloc[0]

    dates = trap_df['Date'].unique()
    dates = sorted(dates)

    dateRanges = date_ranges(dates)

    for dr in dateRanges:
        weather_sub = subset_weather(dr, weather)
        trapWeather[(trap, dr[1])] = calculate_agregate(weather_sub, avgTDict)
    toRet = pd.DataFrame().from_dict(trapWeather)

    return TWeatherDFMaker(trapWeather)

def transform(df):
    observations = []

    traps = df['Trap'].unique()
    for t in traps:
        observations.append(trap_agregator(df[df['Trap'] == t],
        weather, avgTDict))
    toRet = pd.concat(observations, axis = 'rows')

    return toRet

weatherTarget = './input/weather.csv'
weather = yeildWeather(weatherTarget)
avgTDict = yeildAvgTemp(weather)



In [32]:
calcd = trap_agregator(test[test['Trap'] == 'T002'], weather, avgTDict).Date.unique()

2008-06-11T00:00:00.000000000
2010-06-02T00:00:00.000000000
2012-06-08T00:00:00.000000000
2014-06-05T00:00:00.000000000


In [25]:
reald = test[test['Trap']== 'T002'].Date.unique()

In [30]:
reald

array(['2008-06-11T00:00:00.000000000', '2008-06-17T00:00:00.000000000',
       '2008-06-24T00:00:00.000000000', '2008-07-01T00:00:00.000000000',
       '2008-07-04T00:00:00.000000000', '2008-07-11T00:00:00.000000000',
       '2008-07-14T00:00:00.000000000', '2008-07-21T00:00:00.000000000',
       '2008-07-23T00:00:00.000000000', '2008-07-24T00:00:00.000000000',
       '2008-07-28T00:00:00.000000000', '2008-07-29T00:00:00.000000000',
       '2008-08-04T00:00:00.000000000', '2008-08-05T00:00:00.000000000',
       '2008-08-12T00:00:00.000000000', '2008-08-13T00:00:00.000000000',
       '2008-08-19T00:00:00.000000000', '2008-08-25T00:00:00.000000000',
       '2008-08-26T00:00:00.000000000', '2008-09-02T00:00:00.000000000',
       '2008-09-03T00:00:00.000000000', '2008-09-09T00:00:00.000000000',
       '2008-09-15T00:00:00.000000000', '2008-09-19T00:00:00.000000000',
       '2008-09-29T00:00:00.000000000', '2010-06-02T00:00:00.000000000',
       '2010-06-11T00:00:00.000000000', '2010-06-18

In [29]:
for rd in reald:
    if rd in calcd:
        pass
    else:
        print(rd)

2010-06-02T00:00:00.000000000
2012-06-08T00:00:00.000000000
2014-06-05T00:00:00.000000000


In [7]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv', index_col= 0)

train, test = InitPrepross(train,test)

In [8]:
t2 = test
t2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116293 entries, 1 to 116293
Data columns (total 13 columns):
Date                      116293 non-null datetime64[ns]
Species                   116293 non-null object
Trap                      116293 non-null object
Latitude                  116293 non-null float64
Longitude                 116293 non-null float64
Location                  116293 non-null object
CULEX PIPIENS             116293 non-null int32
CULEX PIPIENS/RESTUANS    116293 non-null int32
CULEX RESTUANS            116293 non-null int32
CULEX SALINARIUS          116293 non-null int32
CULEX TERRITANS           116293 non-null int32
CULEX TARSALIS            116293 non-null int32
CULEX ERRATICUS           116293 non-null int32
dtypes: datetime64[ns](1), float64(2), int32(7), object(3)
memory usage: 14.3+ MB


In [9]:
%%time
wtr, wte = WeatherProcess(train, test)

In [12]:
train.shape

(10506, 15)

In [13]:
test.shape

(116293, 13)

In [11]:
wtr.shape

(4343, 19)