Use the function *multi_merge(train, test, lag_list, list_of_lists)* to create lagged-weather variations of our datasets: `lag_list[i]` is the lag in days applied to the weather feature group `list_of_lists[i]`.

---

# Imports

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import haversine as hv
from datetime import timedelta

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, roc_auc_score

In [2]:
# Weather station coordinates keyed by station id.
# Each value is a (latitude, longitude) pair, the same order in which trap
# coordinates are passed to nearest_station.
STATIONS = {
    1: (41.995, -87.933),
    2: (41.786, -87.752),
}

def nearest_station(in_coords):
    """Return the id of the station in STATIONS closest to ``in_coords``.

    Closeness is the value of ``hv.haversine(in_coords, station_coords)``;
    when two stations are equally close, the one listed first in STATIONS wins.
    """
    def _distance_to(station_id):
        return hv.haversine(in_coords, STATIONS[station_id])

    return min(STATIONS, key=_distance_to)

In [23]:
# Load the train set, tag each row with its nearest weather station and parse dates
train = pd.read_csv(r'.\data\train.csv')
train['nearest_station'] = [
    nearest_station([lat, lon]) for lat, lon in zip(train.Latitude, train.Longitude)
]
train['Date'] = train['Date'].astype('datetime64[ns]')

# Same preparation for the test set
test = pd.read_csv(r'.\data\test.csv')
test['nearest_station'] = [
    nearest_station([lat, lon]) for lat, lon in zip(test.Latitude, test.Longitude)
]
test['Date'] = test['Date'].astype('datetime64[ns]')

In [24]:
# Load the cleaned weather table, discard its 'Unnamed: 0' column and parse dates
weather = pd.read_csv(r'.\data\weather_cleaned_stack_back.csv').drop(columns='Unnamed: 0')
weather['Date'] = weather['Date'].astype('datetime64[ns]')

In [25]:
# Show the train columns available after adding nearest_station
train.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'nearest_station'],
      dtype='object')

---
# Munging weather data

In [26]:
# Weather columns grouped so that each group can be merged with its own date lag.
temp_list = [
    'Tmax', 'Tmin', 'Tavg', 'Depart',
    'DewPoint', 'WetBulb', 'Cool',
]
rain_list = ['PrecipTotal']
day_list = [
    'Sunset', 'DaylightHrs',
    'StnPressure', 'SeaLevel',
    'ResultSpeed', 'ResultDir', 'AvgSpeed',
]

In [27]:
# function to merge weather features based on date lag
def to_merge(train,test,lag,feature_list,weather_df=None):
    """Attach lagged weather features to the train and test frames.

    For every row, the weather record of the row's ``nearest_station`` on
    ``Date - lag`` days is looked up and the columns in ``feature_list`` are
    appended.

    Parameters
    ----------
    train, test : DataFrame
        Must contain a datetime ``Date`` column and a ``nearest_station``
        column. They are NOT modified; new frames are returned.
    lag : int
        Number of days to look back in the weather table.
    feature_list : list of str
        Weather columns to attach.
    weather_df : DataFrame, optional
        Weather table with ``Date`` and ``Station`` columns. Defaults to the
        notebook-level ``weather`` frame.

    Returns
    -------
    (DataFrame, DataFrame)
        Merged train and test frames. Each carries a ``date_lag`` column
        holding the date used for the lookup in this call.

    Notes
    -----
    The merge is an inner join, so rows without a matching weather record
    for the lagged date are dropped from the result.
    """
    if weather_df is None:
        weather_df = weather
    weather_cols = weather_df[list(feature_list) + ['Date', 'Station']]
    offset = timedelta(days=lag)

    def _attach(frame):
        # assign() works on a copy, so the caller's frame never gains date_lag
        lagged = frame.assign(date_lag=frame.Date - offset)
        merged = lagged.merge(
            weather_cols,
            left_on=['date_lag', 'nearest_station'],
            right_on=['Date', 'Station'],
        )
        # Both sides have a Date column: keep the observation date (Date_x)
        # and discard the weather-side keys.
        return (
            merged
            .drop(columns=['Date_y', 'Station'])
            .rename(columns={'Date_x': 'Date'})
        )

    return (_attach(train), _attach(test))

In [28]:
# function to merge multiple weather features with different date lags
def multi_merge(train,test,lag_list,list_of_lists):
    """Merge several groups of weather features, each with its own date lag.

    ``lag_list[i]`` is the lag in days applied to the weather columns in
    ``list_of_lists[i]``; the groups are merged one after another through
    ``to_merge``. A ``month`` column derived from ``Date`` is added to both
    results.

    Parameters
    ----------
    train, test : DataFrame
        Frames accepted by ``to_merge``, with a datetime ``Date`` column.
    lag_list : sequence of int
        One lag (in days) per feature group.
    list_of_lists : sequence of list of str
        Weather column names, one list per lag.

    Returns
    -------
    (DataFrame, DataFrame)
        The merged train and test frames.

    Raises
    ------
    ValueError
        If ``lag_list`` and ``list_of_lists`` differ in length. Raising here
        avoids handing ``None`` to callers that unpack the result.
    """
    if len(lag_list) != len(list_of_lists):
        raise ValueError(
            'Mismatch in list lengths: {} lags for {} feature lists'.format(
                len(lag_list), len(list_of_lists)
            )
        )

    for lag, features in zip(lag_list, list_of_lists):
        train, test = to_merge(train, test, lag, features)

    # assign() returns new frames, so the inputs stay untouched even when
    # no merge was performed (empty lag_list).
    train = train.assign(month=train.Date.dt.month)
    test = test.assign(month=test.Date.dt.month)
    return (train, test)

In [8]:
# Pair each lag (in days) with the weather feature group it applies to:
# day features -> 0 days, temperature features -> 3 days, rain features -> 11 days.
lag_list=[0,3,11]
feat_list=[day_list,temp_list,rain_list]
train_1,test_1=multi_merge(train,test,lag_list,feat_list)

In [43]:
# Columns kept for modelling: the features followed by the WnvPresent target.
# NumMosquitos is left out because this feature is not present in the test set.
cols = [
    'month', 'Species', 'Sunset', 'Street', 'DaylightHrs',
    'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool',
    'PrecipTotal',
    'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
    'WnvPresent',
]

In [10]:
# Keep only the modelling columns listed in cols (features plus WnvPresent)
train_1 = train_1[cols]

---
# Preparation of data

In [68]:
# Build the benchmark design matrix: one-hot encode, split, scale, oversample.
scaler = StandardScaler()
sm = SMOTE(sampling_strategy=1,random_state=666)

train_dummies = pd.get_dummies(train_1,drop_first=True,columns=['Species','Street'])
y = train_dummies['WnvPresent']
# Cast the features to float64 up front (as recscore_iter does) so that
# StandardScaler does not have to convert integer columns itself.
X = train_dummies.drop(columns='WnvPresent').astype('float64')

train_x, test_x, train_y, test_y = train_test_split(X,y,test_size = 0.3, random_state = 666,stratify=y)
# The scaler is fitted on the training split only and then applied to the hold-out split.
train_x=scaler.fit_transform(train_x)
test_x=scaler.transform(test_x)
# Oversample only the training split. fit_resample replaces fit_sample,
# which newer imbalanced-learn releases no longer provide.
sampledX,sampledy = sm.fit_resample(train_x,train_y)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  # Remove the CWD from sys.path while we load stuff.


In [70]:
# Benchmark: logistic regression fitted on the resampled training data,
# scored by recall on the hold-out split.
lr = LogisticRegression(solver='liblinear').fit(sampledX, sampledy)
pred = lr.predict(test_x)
recall_score(test_y, pred)

0.6484848484848484

---
# Testing different ranges

## &nbsp;&nbsp;&nbsp;&nbsp;With recall scoring

In [78]:
def recscore_iter(train,test,lag_list,feature_list):
    """Return hold-out recall of a logistic regression for one lag setting.

    The weather features are merged with ``multi_merge`` using ``lag_list``
    and ``feature_list``, the columns in the notebook-level ``cols`` are kept,
    ``Species`` and ``Street`` are one-hot encoded, and the data is split
    70/30 (stratified on the target). The training split is standardised and
    oversampled with SMOTE before fitting; the hold-out split is only scaled.

    Parameters
    ----------
    train, test : DataFrame
        Frames accepted by ``multi_merge``.
    lag_list : list of int
        Lag in days for each feature group.
    feature_list : list of list of str
        Weather feature groups, one per lag.

    Returns
    -------
    float
        ``recall_score`` of the predictions on the hold-out split.
    """
    # Use the feature groups passed in by the caller, not the global feat_list.
    train_1, _ = multi_merge(train, test, lag_list, feature_list)
    train_1 = train_1[cols]
    scaler = StandardScaler()
    sm = SMOTE(sampling_strategy=1,random_state=666)

    train_dummies = pd.get_dummies(train_1,drop_first=True,columns=['Species','Street'])
    train_dummies = train_dummies.astype('float64')
    y = train_dummies['WnvPresent']
    X = train_dummies.drop(columns='WnvPresent')

    train_x, test_x, train_y, test_y = train_test_split(X,y,test_size = 0.3, random_state = 666,stratify=y)
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)
    # fit_resample replaces fit_sample, which newer imbalanced-learn
    # releases no longer provide.
    sampledX, sampledy = sm.fit_resample(train_x, train_y)

    lr = LogisticRegression(solver='liblinear')
    lr.fit(sampledX, sampledy)
    pred = lr.predict(test_x)
    return recall_score(test_y, pred)

In [79]:
# Sweep the temperature lag over 1-7 days (day features at 0, rain at 11)
for i in range(1, 8):
    lag_list = [0, i, 11]
    print(''.join([
        'temp days recall ', str(i), ': ',
        str(recscore_iter(train, test, lag_list, feat_list)),
    ]))

temp days recall 1: 0.6787878787878788
temp days recall 2: 0.696969696969697
temp days recall 3: 0.6727272727272727
temp days recall 4: 0.6787878787878788
temp days recall 5: 0.6787878787878788
temp days recall 6: 0.6666666666666666
temp days recall 7: 0.6727272727272727


In [80]:
# Sweep the rain lag over 1-14 days (day features at 0, temperature at 4)
for i in range(1, 15):
    lag_list = [0, 4, i]
    print(''.join([
        'rain day recall ', str(i), ': ',
        str(recscore_iter(train, test, lag_list, feat_list)),
    ]))

rain day recall 1: 0.6848484848484848
rain day recall 2: 0.6787878787878788
rain day recall 3: 0.6909090909090909
rain day recall 4: 0.6727272727272727
rain day recall 5: 0.703030303030303
rain day recall 6: 0.6848484848484848
rain day recall 7: 0.696969696969697
rain day recall 8: 0.6787878787878788
rain day recall 9: 0.6848484848484848
rain day recall 10: 0.6787878787878788
rain day recall 11: 0.6787878787878788
rain day recall 12: 0.6848484848484848
rain day recall 13: 0.6787878787878788
rain day recall 14: 0.6848484848484848


## &nbsp;&nbsp;&nbsp;&nbsp;With ROC_AUC scoring

In [75]:
def rocscore_iter(train,test,lag_list,feature_list):
    """Return hold-out ROC AUC of a logistic regression for one lag setting.

    The weather features are merged with ``multi_merge`` using ``lag_list``
    and ``feature_list``, the columns in the notebook-level ``cols`` are kept,
    ``Species`` and ``Street`` are one-hot encoded, and the data is split
    70/30 (stratified on the target). The training split is standardised and
    oversampled with SMOTE before fitting; the hold-out split is only scaled.

    Parameters
    ----------
    train, test : DataFrame
        Frames accepted by ``multi_merge``.
    lag_list : list of int
        Lag in days for each feature group.
    feature_list : list of list of str
        Weather feature groups, one per lag.

    Returns
    -------
    float
        ``roc_auc_score`` computed from the predicted probability in the
        second column of ``predict_proba`` on the hold-out split.
    """
    # Use the feature groups passed in by the caller, not the global feat_list.
    train_1, _ = multi_merge(train, test, lag_list, feature_list)
    train_1 = train_1[cols]
    scaler = StandardScaler()
    sm = SMOTE(sampling_strategy=1,random_state=666)

    train_dummies = pd.get_dummies(train_1,drop_first=True,columns=['Species','Street'])
    train_dummies = train_dummies.astype('float64')
    y = train_dummies['WnvPresent']
    X = train_dummies.drop(columns='WnvPresent')

    train_x, test_x, train_y, test_y = train_test_split(X,y,test_size = 0.3, random_state = 666,stratify=y)
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)
    # fit_resample replaces fit_sample, which newer imbalanced-learn
    # releases no longer provide.
    sampledX, sampledy = sm.fit_resample(train_x, train_y)

    lr = LogisticRegression(solver='liblinear')
    lr.fit(sampledX, sampledy)
    # Second column of predict_proba, taken directly from the array
    positive_proba = lr.predict_proba(test_x)[:, 1]
    return roc_auc_score(test_y, positive_proba)

In [81]:
# Sweep the temperature lag over 1-7 days (day features at 0, rain at 11)
for i in range(1, 8):
    lag_list = [0, i, 11]
    print(''.join([
        'temp days roc_auc ', str(i), ': ',
        str(rocscore_iter(train, test, lag_list, feat_list)),
    ]))

temp days roc_auc 1: 0.7750636596970711
temp days roc_auc 2: 0.7818405007558004
temp days roc_auc 3: 0.775329457954165
temp days roc_auc 4: 0.7755790242566272
temp days roc_auc 5: 0.7739172779012082
temp days roc_auc 6: 0.7743271347556584
temp days roc_auc 7: 0.7636140446987452


In [82]:
# Sweep the rain lag over 1-14 days (day features at 0, temperature at 4)
for i in range(1, 15):
    lag_list = [0, 4, i]
    print(''.join([
        'rain day roc_auc ', str(i), ': ',
        str(rocscore_iter(train, test, lag_list, feat_list)),
    ]))

rain day roc_auc 1: 0.7762262734475657
rain day roc_auc 2: 0.7753923567783627
rain day roc_auc 3: 0.7760456929522881
rain day roc_auc 4: 0.7767071451035293
rain day roc_auc 5: 0.7749074271337412
rain day roc_auc 6: 0.7762404764078684
rain day roc_auc 7: 0.7793651276744681
rain day roc_auc 8: 0.775434965659271
rain day roc_auc 9: 0.7771129439693216
rain day roc_auc 10: 0.7758610544683529
rain day roc_auc 11: 0.7755790242566272
rain day roc_auc 12: 0.776210041492934
rain day roc_auc 13: 0.7751062685779793
rain day roc_auc 14: 0.7769526534173337


---
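For Kaggle purposes, we optimise the ROC AUC score, so we use a temperature lag of 2 days and a rain lag of 7 days, the settings with the highest ROC AUC in the sweeps above.<br/>
For modelling purposes in the business case, we optimise the recall score, so we use a temperature lag of 2 days and a rain lag of 5 days, the settings with the highest recall in the sweeps above. Note that the rain-lag sweeps were run with the temperature lag fixed at 4 days, so the chosen combinations themselves were not scored directly.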

In [83]:
# Feature columns for the test set: cols without its 'WnvPresent' entry
target_pos = cols.index('WnvPresent')
testcols = cols[:target_pos] + cols[target_pos + 1:]

In [86]:
# We export our datasets for Kaggle modelling
# Lags in days: day features 0, temperature features 2, rain features 7
lag_list=[0,2,7]
feat_list=[day_list,temp_list,rain_list]
train_1,test_1=multi_merge(train,test,lag_list,feat_list)

# train keeps the WnvPresent target; test uses the same columns without it
train_1 = train_1[cols]
test_1 = test_1[testcols]

# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra first column - confirm downstream readers expect it
train_1.to_csv(r'.\data\train_lag_Kaggle.csv')
test_1.to_csv(r'.\data\test_lag_Kaggle.csv')

In [87]:
# We export our datasets for business modelling
# Lags in days: day features 0, temperature features 2, rain features 5
lag_list=[0,2,5]
feat_list=[day_list,temp_list,rain_list]
train_1,test_1=multi_merge(train,test,lag_list,feat_list)

# train keeps the WnvPresent target; test uses the same columns without it
train_1 = train_1[cols]
test_1 = test_1[testcols]

# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra first column - confirm downstream readers expect it
train_1.to_csv(r'.\data\train_lag_biz.csv')
test_1.to_csv(r'.\data\test_lag_biz.csv')