In [58]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import datetime as dt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegressionCV
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_selection import chi2
%matplotlib inline

In [17]:

# Load dataset 
train = pd.read_csv('asset/train.csv')
test = pd.read_csv('asset/test.csv')
sample = pd.read_csv('asset/sampleSubmission.csv')
weather = pd.read_csv('asset/weather.csv')
spray = pd.read_csv('asset/spray.csv')

# Get labels
labels = train.WnvPresent.values

# Creating dummy variables for the weather data
weather.CodeSum = weather.CodeSum.apply(str.split)
for (i,list_) in enumerate(weather.CodeSum):
    for item in list_:
        if item not in weather.columns:
            weather[item] = 0
        weather.set_value(col=item,index=i,value=1)


# Not using codesum anymore
weather = weather.drop('CodeSum', axis=1)
# Split station 1 and 2 and join horizontally
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

train.Date = pd.to_datetime(train.Date)
test.Date = pd.to_datetime(test.Date)
weather.Date = pd.to_datetime(weather.Date)
spray.Date = pd.to_datetime(spray.Date)

In [18]:
# for c in weather.columns:
#     print(c)
#     print (weather[c].value_counts())

In [19]:
# replace some missing values and T with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

In [20]:
# Functions to extract month and day from dataset
train['year'] = train['Date'].dt.year
train['month'] = train['Date'].dt.month
train['day'] = train['Date'].dt.day
test['year'] = test['Date'].dt.year
test['month'] = test['Date'].dt.month
test['day'] = test['Date'].dt.day

# Add integer latitude/longitude columns
train['Lat_int'] = train.Latitude.apply(int)
train['Long_int'] = train.Longitude.apply(int)
test['Lat_int'] = test.Latitude.apply(int)
test['Long_int'] = test.Longitude.apply(int)

In [21]:
# drop address columns
train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1)
test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)

In [22]:
# Merge with weather data
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')

# train = train.merge(spray, on='Date')
# test = test.merge(spray, on='Date')

In [23]:
# This function allows you to create columns for the data_df input that will have a value of 1 or zero
# If a trap has been sprayed with in the time period specified

def create_sprayed_cols(data_df, spray_df, time_period=2):
    # Iterating over unique dates that sprays took place
    # Sprays took place over 10 days as trucks drove around chicago
    for date in set(spray_df.Date):
        # I only want data for this unique date
        spray_temp = spray_df[spray_df.Date == date]
        # Resetting index to make iterating easier
        spray_temp.index = range(0, len(spray_temp))
        
        # I am creating a column for every unique date and initalizing it's rows to 0
        # I will set these values to 1 when I find a trap that was sprayed
        col_name = 'spray_'+date.strftime('%Y-%m-%d')+"_"+str(time_period)
        data_df[col_name] = 0

        # Iterating over each row of our training data to determine if a trap is in the location
        # of a spray. I am also checking to see if the spray was in the past
        for r in range(0,len(data_df)):
            if data_df.get_value(r,'Date') > date and data_df.get_value(r,'Date') < date + pd.Timedelta(weeks=2) :

                # I am casting the lat and long to ints, and multiplaying by 100 to truncate precision
                # In other words, I'm taking pin points and making them into squares
                cur_lat = int(data_df.get_value(r, 'Latitude') * 100)
                cur_long = int(data_df.get_value(r, 'Longitude') * 100)
                
                # Iterating over each value in my spray data
                for i in range(0, len(spray_temp)):

                    spray_lat = int(spray_temp.get_value(i,'Latitude')*100)
                    spray_long = int(spray_temp.get_value(i,'Longitude')*100)

                    # I am now checking if something is in the square +/- some threshold
                    if (cur_lat < spray_lat + 10 and cur_lat > spray_lat - 10) and \
                    (cur_long < spray_long + 10 and cur_long > spray_long - 10):
                        data_df.set_value(r,col_name, 1)
                        break

In [24]:
create_sprayed_cols(train, spray, time_period=10)
create_sprayed_cols(test,spray, time_period=10)

In [25]:
# Convert categorical data to numbers
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['Species'].values) + list(test['Species'].values))
train['Species'] = lbl.transform(train['Species'].values)
test['Species'] = lbl.transform(test['Species'].values)

lbl.fit(list(train['Street'].values) + list(test['Street'].values))
train['Street'] = lbl.transform(train['Street'].values)
test['Street'] = lbl.transform(test['Street'].values)

lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
train['Trap'] = lbl.transform(train['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)

In [26]:
# drop columns with -1s
train = train.loc[:,(train != -1).any(axis=0)]
test = test.loc[:,(test != -1).any(axis=0)]

In [27]:
train = train.drop('Date', axis=1)
test = test.drop('Date', axis=1)

In [28]:
_ = train.convert_objects(convert_numeric=True)
_ = test.convert_objects(convert_numeric=True)

  """Entry point for launching an IPython kernel.
  


# Grid searching below

In [48]:
param_grid = {'Cs': [[0.001, 0.01, 0.1, 1, 10, 100, 1000]]}
grid = model_selection.GridSearchCV(LogisticRegressionCV(penalty='l2'), param_grid)

In [49]:
# # Random Forest Classifier 
#grid = model_selection.GridSearchCV(estimator=clf, scoring='roc_auc', param_grid=dict(n_estimators=n_estimators,
#                                                     max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split))

grid.fit(train,labels)


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'Cs': [[0.001, 0.01, 0.1, 1, 10, 100, 1000]]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

### Evaluating results

In [50]:
grid.best_estimator_.fit(train, labels)

# create predictions and submission file
predictions = grid.best_estimator_.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('beat_the_benchmark_1.csv', index=False)

In [56]:
grid.best_score_

0.94755377879307068

In [52]:
sum(labels)/float(len(labels))

0.052446221206929371

In [53]:
np.mean(model_selection.cross_val_score(grid.best_estimator_, X=train, y=labels, scoring='roc_auc', n_jobs=7))

0.63703110716832378

In [54]:
model = grid.best_estimator_


#### Looking at feature importances

In [65]:
model.fit(train, labels)
# scores, pvalues = chi2(train, labels)
# features = pd.DataFrame(data=zip(np.exp((model.coef_[0])),pvalues), index=train.columns, columns=['odds_ratio','p_value'])

LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
           class_weight=None, cv=None, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [68]:
zip(train.columns, np.exp(model.coef_[0]))

[('Species', 0.92269670221409106),
 ('Block', 1.0026856009057483),
 ('Street', 1.0007953148598043),
 ('Trap', 1.0017785300695485),
 ('Latitude', 1.0020476981428281),
 ('Longitude', 1.0015914342659371),
 ('AddressAccuracy', 1.0493293370176857),
 ('year', 0.98081006525984393),
 ('month', 1.0137340395187042),
 ('day', 0.99528162606182813),
 ('Lat_int', 0.99765402279766591),
 ('Long_int', 1.0111537650027262),
 ('Tmax_x', 1.0269948382207716),
 ('Tmin_x', 1.0204629454065386),
 ('Tavg_x', 1.03598526745623),
 ('Depart_x', 0.85439404925950913),
 ('DewPoint_x', 1.0428508813629045),
 ('WetBulb_x', 1.0484486443339385),
 ('Heat_x', 0.96102979844793668),
 ('Cool_x', 1.0038977787759948),
 ('Sunrise_x', 1.0308997661494059),
 ('Sunset_x', 1.0056944337595102),
 ('Depth_x', 1.0),
 ('SnowFall_x', 1.0016490269063452),
 ('PrecipTotal_x', 0.98907840465797514),
 ('StnPressure_x', 1.0026933460743592),
 ('SeaLevel_x', 0.99706827261126052),
 ('ResultSpeed_x', 0.97414676239685827),
 ('ResultDir_x', 1.029200474296

In [64]:
train.sum()

Species                                                            22129
Block                                                             374936
Street                                                            730648
Trap                                                              816595
Latitude                                                          439583
Longitude                                                        -921375
AddressAccuracy                                                    82152
year                                                            21112608
month                                                              80818
day                                                               161177
Lat_int                                                           431155
Long_int                                                         -914022
Tmax_x                                                            859895
Tmin_x                                             