## Kaggle Competition - West Nile Virus Prediction

- Data preparation
- Code copied from https://www.kaggle.com/molozhenko/simple-lasagne-nnb

In [3]:
from __future__ import print_function
import numpy as np
import datetime
import csv

# Maps a species label to a six-character string of "0"/"1" flags. assemble_X
# converts each character to a float and appends the six values to the feature
# row. The combined PIPIENS/RESTUANS label sets both of the flags used by the
# individual PIPIENS and RESTUANS entries.
species_map = {'CULEX RESTUANS' : "100000",
              'CULEX TERRITANS' : "010000", 
              'CULEX PIPIENS'   : "001000", 
              'CULEX PIPIENS/RESTUANS' : "101000", 
              'CULEX ERRATICUS' : "000100", 
              'CULEX SALINARIUS': "000010", 
              'CULEX TARSALIS' :  "000001",
              'UNSPECIFIED CULEX': "001000"} # Treating unspecified as PIPIENS (http://www.ajtmh.org/content/80/2/268.full)

def date(text):
    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.date``.

    Raises ``ValueError`` (from ``strptime``) when the text does not match
    the expected format.
    """
    parsed = datetime.datetime.strptime(text, "%Y-%m-%d")
    return parsed.date()
    
def precip(text):
    """Convert a weather-file cell to a float, honoring its special markers.

    Surrounding whitespace is ignored. The markers ``"M"`` and ``"-"`` yield
    ``None``, ``"T"`` yields a small fixed trace amount (1e-3), and anything
    else is parsed with ``float`` (which raises ``ValueError`` on bad text).
    """
    trace_amount = 1e-3
    cleaned = text.strip()
    if cleaned in ("M", "-"):
        return None
    return trace_amount if cleaned == "T" else float(cleaned)

def impute_missing_weather_station_values(weather):
    """Fill gaps in one station's reading from the other station, in place.

    ``weather`` maps a key to a two-element list of per-station records
    (dict-like). A station whose whole record is ``None`` is pointed at the
    other station's record; afterwards every field that is ``None`` in one
    record is copied from the other. A field that is ``None`` in both records
    stays ``None``. Nothing is returned.
    """
    for pair in weather.values():
        # Whole-record fallback: reuse the other station's record object.
        if pair[0] is None:
            pair[0] = pair[1]
        elif pair[1] is None:
            pair[1] = pair[0]
        # Field-level fallback, first station 0 from 1, then station 1 from 0.
        for target, source in ((pair[0], pair[1]), (pair[1], pair[0])):
            for field in target:
                if target[field] is None:
                    target[field] = source[field]
    
def load_weather(path="./data/westnile/weather.csv"):
    """Load the weather CSV into a dict keyed by date.

    Args:
        path: Location of the weather CSV. Defaults to the path the notebook
            has always used, so existing ``load_weather()`` calls are
            unaffected.

    Returns:
        A dict mapping each ``datetime.date`` to a two-element list
        ``[station_1_record, station_2_record]``. Each record is the CSV row
        with the listed columns converted to ``date``/``float`` (or ``None``
        when the cell is the marker ``"M"`` or the converter returns
        ``None``). Gaps are filled from the other station by
        ``impute_missing_weather_station_values`` before returning.

    Raises:
        AssertionError: if a row's ``Station`` is not 1 or 2, or if the same
            (date, station) pair appears twice.
    """
    # Built once rather than once per row; the mapping never changes.
    converters = {"Date" : date,
                  "Tmax" : float, "Tmin" : float, "Tavg" : float,
                  "DewPoint" : float, "WetBulb" : float,
                  "PrecipTotal" : precip, "Sunrise" : precip, "Sunset" : precip,
                  "Depart" : float, "Heat" : precip, "Cool" : precip,
                  "ResultSpeed" : float, "ResultDir" : float, "AvgSpeed" : float,
                  "StnPressure" : float, "SeaLevel" : float}
    weather = {}
    # The context manager guarantees the file is closed even if a row fails
    # to parse.
    with open(path) as handle:
        for line in csv.DictReader(handle):
            for name, converter in converters.items():
                x = line[name].strip()
                line[name] = converter(x) if (x != "M") else None
            # Station is 1-based in the file; shift it to a list index.
            station = int(line["Station"]) - 1
            assert station in [0,1]
            dt = line["Date"]
            if dt not in weather:
                weather[dt] = [None, None]
            assert weather[dt][station] is None, "duplicate weather reading {0}:{1}".format(dt, station)
            weather[dt][station] = line
    impute_missing_weather_station_values(weather)
    return weather
    
    
def load_training(path="./data/westnile/train.csv"):
    """Load the training CSV as a list of row dicts.

    Args:
        path: Location of the training CSV. Defaults to the path the notebook
            has always used, so existing ``load_training()`` calls are
            unaffected.

    Returns:
        A list with one dict per CSV row, in file order. ``Date`` is
        converted with ``date``, ``Latitude``/``Longitude`` to ``float`` and
        ``NumMosquitos``/``WnvPresent`` to ``int``; other columns stay as the
        strings read from the file.
    """
    converters = {"Date" : date,
                  "Latitude" : float, "Longitude" : float,
                  "NumMosquitos" : int, "WnvPresent" : int}
    training = []
    # The context manager closes the file, which the bare open() never did.
    with open(path) as handle:
        for line in csv.DictReader(handle):
            for name, converter in converters.items():
                line[name] = converter(line[name])
            training.append(line)
    return training
    
def load_testing(path="./data/westnile/test.csv"):
    """Load the test CSV as a list of row dicts.

    Args:
        path: Location of the test CSV. Defaults to the path the notebook has
            always used, so existing ``load_testing()`` calls are unaffected.

    Returns:
        A list with one dict per CSV row, in file order. ``Date`` is
        converted with ``date`` and ``Latitude``/``Longitude`` to ``float``;
        other columns stay as the strings read from the file.
    """
    converters = {"Date" : date,
                  "Latitude" : float, "Longitude" : float}
    testing = []
    # The context manager closes the file, which the bare open() never did.
    with open(path) as handle:
        for line in csv.DictReader(handle):
            for name, converter in converters.items():
                line[name] = converter(line[name])
            testing.append(line)
    return testing
    
    
def closest_station(lat, longi):
    """Return the index (0 or 1) of the weather station nearest a point.

    Distance is the plain squared difference of (latitude, longitude)
    pairs, i.e. the coordinates are treated as a flat rectangular grid,
    which is acceptable because Chicago is small.
    """
    station_coords = np.array([[41.995, -87.933],
                               [41.786, -87.752]])
    point = np.array([lat, longi])
    offsets = station_coords - point[None, :]
    squared_distance = np.sum(offsets**2, axis=1)
    return np.argmin(squared_distance)
       
def normalize(X, mean=None, std=None):
    """Impute NaNs with the column mean and standardize ``X`` in place.

    Args:
        X: 2-D float array, modified in place.
        mean: Per-column means to use. When ``None`` they are computed from
            ``X`` while ignoring NaNs.
        std: Per-column standard deviations to use. When ``None`` they are
            computed from ``X`` after the NaNs have been imputed.

    Returns:
        The ``(mean, std)`` pair that was applied, so the same statistics can
        be passed back in to normalize another array. ``std`` is returned
        unmodified, zeros included.

    A column whose standard deviation is 0 (a constant column) is only
    centered: dividing by 0 would fill it with NaN/inf, so 1 is used as the
    divisor for such columns.
    """
    if mean is None:
        mean = np.nanmean(X, axis=0)
    # Replace every NaN with the mean of its own column.
    missing = np.isnan(X)
    X[missing] = np.broadcast_to(mean, X.shape)[missing]
    if std is None:
        std = np.std(X, axis=0)
    # Work on a copy so the caller's std is returned untouched.
    divisor = np.array(std)
    divisor[divisor == 0] = 1
    X[...] = (X - mean) / divisor
    return mean, std
    
def scaled_count(record):
    """Return how many times a record should be repeated in the dataset.

    Records carrying a ``"NumMosquitos"`` entry get that count divided by 9
    and rounded up; records without it (test data) always count once.
    """
    mosquitos_per_repeat = 9.0
    if "NumMosquitos" in record:
        repeats = np.ceil(record["NumMosquitos"] / mosquitos_per_repeat)
        return int(repeats)
    # No mosquito count available: this is test data.
    return 1
    
    
def assemble_X(base, weather):
    """Build the float32 feature matrix for a list of trap records.

    Args:
        base: Iterable of record dicts with ``Date``, ``Latitude``,
            ``Longitude`` and ``Species`` entries (and optionally
            ``NumMosquitos``, which drives row repetition).
        weather: Mapping of date -> ``[station_0_record, station_1_record]``
            as produced by ``load_weather``.

    Returns:
        A 2-D ``np.float32`` array. Each row holds 6 date/location values,
        12 weather observations for each of 6 look-back offsets taken from
        the station closest to the trap, and the 6 species flags (84 columns
        in total). Each record contributes ``scaled_count(record)`` identical
        rows.

    Raises:
        KeyError: if a look-back date is missing from ``weather`` or the
            species label is not in ``species_map``.
    """
    lookback_days = [0,1,3,5,8,12]
    observations = ["Tmax","Tmin","Tavg","DewPoint","WetBulb","PrecipTotal","Depart","Sunrise","Sunset","Cool","ResultSpeed","ResultDir"]
    X = []
    for b in base:
        # Named `when` so the module-level date() parser is not shadowed.
        when = b["Date"]
        lat, longi = b["Latitude"], b["Longitude"]
        # The nearest station depends only on the trap location, so look it
        # up once per record instead of once per observation.
        station = closest_station(lat, longi)
        case = [when.year, when.month, when.day, when.weekday(), lat, longi]
        # Look at a selection of past weather values
        for days_ago in lookback_days:
            reading = weather[when - datetime.timedelta(days=days_ago)][station]
            case.extend(reading[obs] for obs in observations)
        # Specify which mosquitos are present
        case.extend(float(x) for x in species_map[b["Species"]])
        # Weight each observation by the number of mosquitos seen. Test data
        # doesn't have this column, so in that case use 1. This accidentally
        # takes into account multiple entries that result from >50 mosquitos
        # on one day.
        X.extend([case] * scaled_count(b))
    return np.asarray(X, dtype=np.float32)
    
def assemble_y(base):
    """Build the label column that lines up with ``assemble_X``'s rows.

    Each record's ``WnvPresent`` value is repeated ``scaled_count(record)``
    times. The result is an ``np.int32`` array of shape ``(n_rows, 1)``.
    """
    labels = []
    for record in base:
        label = record["WnvPresent"]
        labels.extend([label] * scaled_count(record))
    column = np.asarray(labels, dtype=np.int32)
    return column.reshape(-1, 1)


class AdjustVariable(object):
    """Callable that moves a shared variable toward a target on every call.

    ``variable`` must expose ``get_value()`` and ``set_value()``. Each call
    divides the remaining distance to ``target`` by ``2 ** (1 / half_life)``,
    so the distance halves after ``half_life`` calls.
    """

    def __init__(self, variable, target, half_life=20):
        self.variable, self.target = variable, target
        self.half_life = half_life

    def __call__(self, nn, train_history):
        # nn and train_history are accepted but not used here.
        # NOTE(review): signature looks like a training-callback hook - confirm.
        decay = 2**(1.0/self.half_life)
        remaining = self.variable.get_value() - self.target
        remaining /= decay
        updated = np.float32(self.target + remaining)
        self.variable.set_value(updated)

In [29]:
# Load the per-date weather records and the training rows.
weather = load_weather()
training = load_training()
    
# Build the feature matrix, standardize it in place (keeping the column
# mean/std that were applied), and build the matching label column.
X = assemble_X(training, weather)
mean, std = normalize(X)
y = assemble_y(training)

In [30]:
import pandas as pd

In [41]:
# Wrap the normalized feature matrix in a DataFrame with columns X1..Xn,
# taking n from the matrix itself rather than hard-coding it.
westnile_data = pd.DataFrame(X)
westnile_data.columns = [ 'X'+str(i+1) for i in range(X.shape[1]) ]
# Derive the Yes/No label column directly from y so this cell does not rely
# on a separately executed cell having defined it first.
westnile_data['y'] = ['No' if label == 0 else 'Yes' for label in y.ravel()]

westnile_data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X76,X77,X78,X79,X80,X81,X82,X83,X84,y
0,-0.906075,-2.709929,1.541163,-1.155984,1.05517,-0.994872,0.657944,-0.752451,-0.055497,-0.352568,...,-1.392641,1.332225,-0.948726,0.690327,-0.106183,0.53013,-0.006858,-0.06448,-0.016803,No
1,-0.906075,-2.709929,1.541163,-1.155984,1.05517,-0.994872,0.657944,-0.752451,-0.055497,-0.352568,...,-1.392641,1.332225,-0.948726,0.690327,-0.106183,-1.886373,-0.006858,-0.06448,-0.016803,No
2,-0.906075,-2.709929,1.541163,-1.155984,1.386389,-0.69833,0.657944,-0.752451,-0.055497,-0.352568,...,-1.392641,1.332225,-0.948726,0.690327,-0.106183,-1.886373,-0.006858,-0.06448,-0.016803,No
3,-0.906075,-2.709929,1.541163,-1.155984,1.214619,-1.217707,0.657944,-0.752451,-0.055497,-0.352568,...,-1.392641,1.332225,-0.948726,0.690327,-0.106183,0.53013,-0.006858,-0.06448,-0.016803,No
4,-0.906075,-2.709929,1.541163,-1.155984,1.214619,-1.217707,0.657944,-0.752451,-0.055497,-0.352568,...,-1.392641,1.332225,-0.948726,0.690327,-0.106183,-1.886373,-0.006858,-0.06448,-0.016803,No


In [42]:
# Write the assembled feature/label table to CSV, leaving out the row index.
westnile_data.to_csv('./data/data-westnile.csv', index=False)

In [40]:
# Turn each label row of y into the string 'No' (label 0) or 'Yes' (anything
# else), then show how many labels were produced.
factor_y = ['No' if label == 0 else 'Yes' for label in y]
len(factor_y)

21260

In [39]:
# Display the number of Yes/No labels currently held in factor_y.
len(factor_y)

19387