In [2]:
import shapefile
from scipy.spatial import cKDTree
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression

In [44]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv', index_col= 0)

In [20]:
def agg_on_species(train, test):
    
    noAgg = [c for c in train.columns if c not in ['NumMosquitos','WnvPresent']]

    agg = train.groupby(noAgg)['NumMosquitos', 'WnvPresent'].sum()

    for i, c in enumerate(noAgg):
        agg[c] = agg.index.map(lambda x:x[i])

    agg.index = range(0,len(agg))
    agg['WnvPresent'] = (agg['WnvPresent'].map(lambda x:x>0)).astype(int)
    return agg, test

In [43]:
def InitPrepross(train, test):

    def location_add(df):
        df['Location'] = [(df.loc[idx,'Longitude'], df.loc[idx, 'Latitude'])
                            for idx in df.index]
        return df

    def change_date(df):
        df['Date'] = pd.to_datetime(df['Date'])

        return df

    def drop_unused(df):
        for col in ['Address','Block','Street',
              'AddressNumberAndStreet', 'AddressAccuracy',
                    ]:
            try:
                df = df.drop(col, axis = 'columns')
            except:
                print(col, 'not present')

        return df

    def species_dummies(df):
        species = ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS',
                'CULEX RESTUANS', 'CULEX SALINARIUS',
                'CULEX TERRITANS', 'CULEX TARSALIS',
                 'CULEX ERRATICUS']
        for s in species:
            df[s] = (df['Species'] == s).astype(int)

        return df

    def transform(df):
        df = drop_unused(df)
        df = location_add(df)
        df = change_date(df)
        df = species_dummies(df)
        return df
        
    return transform(train), transform(test)

In [46]:
tup = InitPrepross(train,test)

In [39]:
df.head(2)

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,NumMosquitos,WnvPresent,Location
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,1,0,"(-87.800991, 41.95469)"
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,1,0,"(-87.800991, 41.95469)"


In [24]:
tup = InitPrepross(train, test)

In [79]:

def LocationProcess(train, test):
    parkDir = './AddData/Parks/'
    waterDir = './AddData/Water/'

    def buildWaterFinder():
        water = [f for f in listdir(waterDir)
                if isfile(join(waterDir,f))
                if f.count('.csv') ==0]

        waterShape = waterDir + water[0].split('.')[0]
        waterSR = shapefile.Reader(waterShape).shapeRecords()

        waterFinder = {}
        for i, s in enumerate(waterSR):
            waterFinder[i] = cKDTree(s.shape.points)

        return waterFinder

    def buildParkDicts():
        parks = [f for f in listdir(parkDir)
                if isfile(join(parkDir,f))
                if f.count('.csv') ==0]
        parkShape = parkDir + parks[0].split('.')[0]
        parkSR = shapefile.Reader(parkShape).shapeRecords()

        parkFinder = {}
        parkSize = {}
        for s in parkSR:
            parkSize[s.record[4]] = s.record[19]
            parkFinder[s.record[4]] = cKDTree(s.shape.points)

        return parkFinder, parkSize

    def calculate_distances(loc, finder, size = None):
        Dist = {}
        for k in finder:
            Dist[k] = finder[k].query(loc, 1)[0]

        if size:
            toRet = {}
            for k in Dist:
                Dist[k] = (Dist[k], size[k], size[k]/(Dist[k]**2))
        return Dist

    def dfFromDict(dct):
        toRet = pd.DataFrame(dct)
        toRet = toRet.transpose()
        toRet.index = [idx for idx in toRet.index]

        if type(toRet.iloc[0,0]) == tuple:
            for c in toRet:
                toRet['P ' + str(c) + ' A'] = [e[1] for e in toRet[c]]
                toRet['P ' + str(c) + ' E'] = [e[2] for e in toRet[c]]
                toRet['P ' + str(c)] = [e[0] for e in toRet[c]]
                toRet = toRet.drop(c, axis = 'columns')
        else:
            toRet.columns = ['W ' + str(c) for c in toRet.columns]

        return toRet

    def info(df, finder, size = None):
        uniqueLocs = df['Location'].unique()
        rows = {}
        for loc in uniqueLocs:
            rows[loc] = calculate_distances(loc, finder, size)

        return dfFromDict(rows)

    def transform(df):
        toRet = pd.concat( [info(df, waterFinder),
                    info(df, parkFinder, parkSize)],
                    axis = 'columns')

        return toRet

    parkFinder, parkSize = buildParkDicts()
    waterFinder = buildWaterFinder()

    # Returns DFs: index = locations
    return transform(train), transform(test)

In [80]:
tup2 = LocationProcess(tup[0], tup[1])

In [81]:
tup2[0].shape

(138, 2396)

In [83]:
len([c for c in tup2[1].columns if (c[0] != 'W') & (c[0] != 'P')])

0

In [127]:
def SVD(train, test):

    def find_cols(df, tpe):
        mask = [c for c in df.columns if c[0] == tpe ]

        return df.loc[:,mask]

    def yeildFitTSVD(df):
        comps = 4

        TSVD = TruncatedSVD(n_components = comps)
        TSVD.fit(df)

        return TSVD

    def transformTSVD(df, TSVD):
        toRet = TSVD.transform(df)
        toRet = pd.DataFrame(toRet, index = df.index)
        #print(toRet)
        return toRet

    
    toRetTrain = []
    toRetTest = []
    for t in ['W', 'P']:
        sTrain = find_cols(train, t)
        sTest = find_cols(test,t)
        sTSVD = yeildFitTSVD(sTrain)

        toRetTrain.append(transformTSVD(sTrain, sTSVD))
        toRetTest.append(transformTSVD(sTest, sTSVD))

    toRetTrain = pd.concat(toRetTrain, axis = 'columns')
    toRetTest = pd.concat(toRetTest, axis = 'columns')

    return toRetTrain, toRetTest


In [92]:
tup2[0].loc[:,[c for c in tup2[0].columns if c[0] == 'W']]

Unnamed: 0,W 0,W 1,W 2,W 3,W 4,W 5,W 6,W 7,W 8,W 9,...,W 595,W 596,W 597,W 598,W 599,W 600,W 601,W 602,W 603,W 604
"(-87.930995, 41.957799)",0.258189,0.273510,0.280411,0.283766,0.291122,0.291177,0.291197,0.291264,0.291516,0.291665,...,0.299097,0.292277,0.292162,0.292374,0.292514,0.292597,0.292728,0.292225,0.280558,0.288188
"(-87.890615, 41.974689)",0.231906,0.246757,0.252666,0.255866,0.262978,0.263047,0.263073,0.263157,0.263437,0.263598,...,0.269351,0.262915,0.262806,0.263074,0.263213,0.263353,0.263483,0.263216,0.252771,0.259516
"(-87.862995, 41.992478000000006)",0.221358,0.235498,0.240353,0.243353,0.250125,0.250207,0.250238,0.250337,0.250640,0.250810,...,0.254836,0.248866,0.248765,0.249085,0.249221,0.249415,0.249542,0.249511,0.240415,0.246194
"(-87.832763, 41.944869)",0.167559,0.182201,0.187844,0.191005,0.198062,0.198133,0.198160,0.198246,0.198531,0.198693,...,0.204270,0.197854,0.197745,0.198017,0.198156,0.198300,0.198429,0.198183,0.187939,0.194524
"(-87.824812, 41.974089)",0.181188,0.194910,0.199303,0.202214,0.208837,0.208923,0.208956,0.209059,0.209371,0.209543,...,0.213020,0.207193,0.207095,0.207429,0.207564,0.207773,0.207900,0.207941,0.199348,0.204757
"(-87.812827, 41.981964000000005)",0.179164,0.192306,0.196046,0.198813,0.205178,0.205272,0.205307,0.205418,0.205740,0.205916,...,0.208375,0.202863,0.202770,0.203132,0.203263,0.203503,0.203626,0.203803,0.196066,0.200867
"(-87.81150600000001, 42.011601)",0.202275,0.214563,0.217380,0.219918,0.225856,0.225959,0.225998,0.226119,0.226454,0.226634,...,0.227503,0.222540,0.222456,0.222857,0.222983,0.223267,0.223385,0.223770,0.217364,0.221238
"(-87.807277, 42.009876)",0.198516,0.210690,0.213402,0.215915,0.221808,0.221912,0.221951,0.222073,0.222409,0.222589,...,0.223320,0.218404,0.218321,0.218725,0.218851,0.219137,0.219255,0.219658,0.213382,0.217163
"(-87.805059, 41.973845000000004)",0.167966,0.181077,0.184810,0.187579,0.193952,0.194046,0.194081,0.194192,0.194513,0.194689,...,0.197218,0.191676,0.191582,0.191942,0.192074,0.192311,0.192435,0.192602,0.184830,0.189651
"(-87.800991, 41.95469)",0.150756,0.164328,0.168614,0.171511,0.178120,0.178207,0.178239,0.178343,0.178655,0.178827,...,0.182355,0.176493,0.176394,0.176726,0.176861,0.177068,0.177195,0.177227,0.168656,0.174036


In [86]:
list(tup2[0].columns)

['W 0',
 'W 1',
 'W 2',
 'W 3',
 'W 4',
 'W 5',
 'W 6',
 'W 7',
 'W 8',
 'W 9',
 'W 10',
 'W 11',
 'W 12',
 'W 13',
 'W 14',
 'W 15',
 'W 16',
 'W 17',
 'W 18',
 'W 19',
 'W 20',
 'W 21',
 'W 22',
 'W 23',
 'W 24',
 'W 25',
 'W 26',
 'W 27',
 'W 28',
 'W 29',
 'W 30',
 'W 31',
 'W 32',
 'W 33',
 'W 34',
 'W 35',
 'W 36',
 'W 37',
 'W 38',
 'W 39',
 'W 40',
 'W 41',
 'W 42',
 'W 43',
 'W 44',
 'W 45',
 'W 46',
 'W 47',
 'W 48',
 'W 49',
 'W 50',
 'W 51',
 'W 52',
 'W 53',
 'W 54',
 'W 55',
 'W 56',
 'W 57',
 'W 58',
 'W 59',
 'W 60',
 'W 61',
 'W 62',
 'W 63',
 'W 64',
 'W 65',
 'W 66',
 'W 67',
 'W 68',
 'W 69',
 'W 70',
 'W 71',
 'W 72',
 'W 73',
 'W 74',
 'W 75',
 'W 76',
 'W 77',
 'W 78',
 'W 79',
 'W 80',
 'W 81',
 'W 82',
 'W 83',
 'W 84',
 'W 85',
 'W 86',
 'W 87',
 'W 88',
 'W 89',
 'W 90',
 'W 91',
 'W 92',
 'W 93',
 'W 94',
 'W 95',
 'W 96',
 'W 97',
 'W 98',
 'W 99',
 'W 100',
 'W 101',
 'W 102',
 'W 103',
 'W 104',
 'W 105',
 'W 106',
 'W 107',
 'W 108',
 'W 109',
 'W 110',


In [128]:
tup3 = SVD(tup2[0], tup2[1])

In [129]:
tup3[0].head()

Unnamed: 0,0,1,2,3,0.1,1.1,2.1,3.1
"(-87.930995, 41.957799)",7.452501,-2.415331,2.122308,-0.353222,15659.729641,3853.46179,3827.455257,4010.159684
"(-87.890615, 41.974689)",6.846056,-2.593422,1.839218,-0.280489,21658.491035,4478.92493,4321.933261,5544.188179
"(-87.862995, 41.992478000000006)",6.625241,-2.65285,1.583163,-0.151807,27853.259897,4836.396191,4414.361346,6739.77145
"(-87.832763, 41.944869)",5.744749,-2.252962,1.059791,-0.23373,36393.923204,6716.770461,7087.101363,12322.940434
"(-87.824812, 41.974089)",5.919241,-2.453215,1.058257,-0.087411,41503.869,6276.862943,5880.418044,11648.645524


In [130]:

def WeatherProcess(train, test):

    def yeildWeather(target):
        weather = pd.read_csv(target)
        weather['Date'] = pd.to_datetime(weather['Date'])

        toDrop = ['Depart', 'Depth','Water1',
                'SnowFall', 'CodeSum', 'Heat',
                'Cool', 'Sunrise']
        weather = weather.drop(toDrop, axis=1)

        toReplace = {'M':np.nan, '  T': 0.001, '-': '0000'}
        for k in toReplace:
            weather = weather.replace(k, toReplace[k])


        toFloats = ['Tavg', 'WetBulb', 'PrecipTotal','StnPressure',
                    'SeaLevel', 'ResultSpeed','AvgSpeed']
        for c in toFloats:
            weather[c] = weather[c].astype(float)

        weather['Sunset'] = [date
                            if date[-2:] != '60'
                            else str(int(date[0:2])+1)+'00'
                            for date in weather['Sunset']]

        weather['Sunset'] = pd.to_datetime(weather['Sunset'],
                                            format="%H%M")
        weather.dropna(inplace=True)

        return weather[weather['Station']== 1]

    def yeildAvgTemp(weather):
        weather['Wk'] = weather['Date'].dt.week
        weekTemp = pd.DataFrame(
                        weather.groupby('Wk')['Tavg'].mean())
        weekTemp['Week'] = weekTemp.index - 17
        weekTemp['Week^2'] = weekTemp['Week']**2

        lr = LinearRegression().fit(weekTemp.drop('Tavg', axis = 'columns'),
                                    weekTemp['Tavg'])
        toRet = {}
        for w in range(1,53):
            toRet[w] = lr.intercept_ + (lr.coef_[0]*(w-17)) + (lr.coef_[1] * ((w-17)**2))

        return toRet

    def calculate_agregate( weather_sub, avgTDict):
        toRet = pd.Series()

        allAgg = [np.max, np.min, np.mean]
        toAgg = {'DewPoint': allAgg,
                'StnPressure': allAgg,
                'AvgSpeed': allAgg,
                'Tmax':[np.max],
                'Tmin':[np.min],
                'Tavg':[np.mean],
                'PrecipTotal':[np.sum, np.mean]
                }
        for k in toAgg:
            for f in toAgg[k]:
                toRet.loc[k + str(f).split(' ')[1]] = f(weather_sub[k])

        finalEntry = weather_sub.iloc[len(weather_sub)-1]

        toRet['temp_expected'] = avgTDict[pd.to_datetime(finalEntry['Date']).week]
        toRet['temp_diff'] = toRet['Tavgmean'] - toRet['temp_expected']

        sunset = finalEntry['Sunset']
        toRet['sunset'] = sunset.hour + (sunset.minute / 60)

        return toRet

    def date_ranges(dates):
        uniqueYears = set([pd.to_datetime(d).year for d in dates])

        dates = sorted(dates)
        fyear = []
        for y in uniqueYears:
            for d in dates:
                if pd.to_datetime(d).year == y:
                    fyear.append(d)
                    break

        for d in fyear:
            dates = np.insert(dates, 0, d - pd.Timedelta(days = 8))

        dateRanges = []
        for i in range(len(dates)-1):
            if pd.to_datetime(dates[i]).year == pd.to_datetime(dates[i+1]).year:
                dateRanges.append( (dates[i], dates[i+1]) )

        return dateRanges

    def subset_weather(dateRange, weather):
        mask = (weather['Date']>dateRange[0]) & (weather['Date'] <= dateRange[1])
        return weather.loc[mask]

    def TWeatherDFMaker(dct):
        toRet = pd.DataFrame().from_dict(dct)
        toRet = toRet.transpose()
        toRet.index = [idx for idx in toRet.index]
        toRet['Trap'] = toRet.index.map(lambda x: x[0])
        toRet['Date'] = toRet.index.map(lambda x: x[1])
        toRet.index = range(len(toRet))

        return toRet

    def trap_agregator(trap_df, weather, avgTDict):
        trapWeather = {}
        trap = trap_df['Trap'].iloc[0]

        dates = trap_df['Date'].unique()
        dates = sorted(dates)

        dateRanges = date_ranges(dates)

        for dr in dateRanges:
            weather_sub = subset_weather(dr, weather)
            trapWeather[(trap, dr[1])] = calculate_agregate(weather_sub, avgTDict)
        toRet = pd.DataFrame().from_dict(trapWeather)

        return TWeatherDFMaker(trapWeather)

    def transform(df):
        observations = []

        traps = df['Trap'].unique()
        for t in traps:
            observations.append(trap_agregator(df[df['Trap'] == t],
            weather, avgTDict))
        toRet = pd.concat(observations, axis = 'rows')

        return toRet

    weatherTarget = './input/weather.csv'
    weather = yeildWeather(weatherTarget)
    avgTDict = yeildAvgTemp(weather)

    return transform(train), transform(test)

In [132]:
tupw = WeatherProcess(tup[0],tup[1])

In [135]:
tupw[1].head()

Unnamed: 0,DewPointamax,DewPointamin,DewPointmean,StnPressureamax,StnPressureamin,StnPressuremean,AvgSpeedamax,AvgSpeedamin,AvgSpeedmean,Tmaxamax,Tminamin,Tavgmean,PrecipTotalsum,PrecipTotalmean,temp_expected,temp_diff,sunset,Trap,Date
0,71.0,56.0,63.75,29.28,28.9,29.1,21.5,5.2,11.675,87.0,54.0,73.125,2.133,0.266625,70.887919,2.237081,19.433333,T002,2008-06-11
1,63.0,46.0,55.5,29.26,29.15,29.208333,16.3,8.8,11.2,91.0,55.0,71.833333,0.57,0.095,72.191174,-0.357841,19.483333,T002,2008-06-17
2,54.0,46.0,51.142857,29.44,29.18,29.265714,7.9,5.1,6.485714,82.0,52.0,68.857143,0.26,0.037143,73.228946,-4.371803,19.516667,T002,2008-06-24
3,67.0,50.0,58.714286,29.28,28.98,29.151429,13.2,7.3,9.214286,86.0,58.0,72.714286,0.701,0.100143,74.001234,-1.286948,19.516667,T002,2008-07-01
4,62.0,50.0,54.333333,29.35,29.09,29.26,13.2,6.6,10.6,87.0,52.0,67.333333,0.201,0.067,74.001234,-6.6679,19.5,T002,2008-07-04
