In [163]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

In [142]:
train = pd.read_csv('../datasets/train.csv')
weather = pd.read_csv('../datasets/weather.csv')
test = pd.read_csv('../datasets/test.csv')

In [174]:
# convert dates to date time
train['Date'] = pd.to_datetime(train['Date'])
weather['Date'] = pd.to_datetime(weather['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# convert to numeric
test['Longitude'] = pd.to_numeric(test['Longitude'])
test['Latitude'] = pd.to_numeric(test['Latitude'])

KeyError: 'Date'

In [None]:
# make date weather index
weather.set_index('Date', inplace=True)

In [None]:
# filter for station 1
mask = (weather['Station'] == 1)
weather1 = weather[mask]

In [None]:
precip_list = []
for i in weather1['PrecipTotal']:
    if i != '  T':
        precip_list.append(float(i))
    else:
        precip_list.append(float('0.0'))

In [None]:
weather_subset = pd.DataFrame(columns = ['Date','total_precip'])
weather_subset['total_precip']  = precip_list
weather_subset['Date'] = weather1.index
weather_subset.set_index('Date', inplace=True)

In [None]:
#resample to get weekly precipitation and make into a df
weekly_precip = weather_subset.resample('W').mean()
weekly_df = pd.DataFrame(columns = ['Date', 'PavgW'])
weekly_df['Date'] = weekly_precip.index
weekly_df['PavgW'] = weekly_precip.values

In [None]:
# get range of dates in order to join to the weekly df
end = max(weather1.index)
start = min(weather1.index)
new_df = pd.DataFrame(columns = ['Date'])
new_df['Date'] = pd.bdate_range(start, end, freq = 'D')
#backfill
precip = new_df.merge(weekly_df, on = 'Date', how = 'left')
precip = precip.fillna(method = 'bfill', limit = 6)
precip.head()

In [150]:
# merge precip with test
test = test.merge(precip, on = 'Date', how = 'left')

# Get Avg Temp

In [151]:
# resample 
weekly_weather = weather1.Tmax.resample('W').mean()
weekly_temp = pd.DataFrame(columns = ['Date', 'TmaxW'])
weekly_temp['Date'] = weekly_weather.index
weekly_temp['TmaxW'] = weekly_weather.values
end = max(weather1.index)
start = min(weather1.index)
new_df = pd.DataFrame(columns = ['Date'])
new_df['Date'] = pd.bdate_range(start, end, freq = 'D')

In [152]:
temps = new_df.merge(weekly_temp, on = 'Date', how = 'left')
temps = temps.fillna(method = 'bfill', limit = 6)

In [153]:
# merge temps with test
test = test.merge(temps, on = 'Date', how = 'left')

In [154]:
test.head(1)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,PavgW,TmaxW
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0.101429,82.428571


In [155]:
test.count()

Id                        116293
Date                      116293
Address                   116293
Species                   116293
Block                     116293
Street                    116293
Trap                      116293
AddressNumberAndStreet    116293
Latitude                  116293
Longitude                 116293
AddressAccuracy           116293
PavgW                     116293
TmaxW                     116293
dtype: int64

# Model

In [156]:
train_data = pd.read_csv('../datasets/train_clean.csv')

In [167]:
clean_data = train_data[[u'Latitude', u'Longitude', u'PavgW', u'TmaxW',
    u'Species_CULEX ERRATICUS', u'Species_CULEX PIPIENS',
       u'Species_CULEX PIPIENS/RESTUANS', u'Species_CULEX RESTUANS',
       u'Species_CULEX SALINARIUS', u'Species_CULEX TARSALIS',
       u'Species_CULEX TERRITANS']]

In [168]:
X_train, X_test, y_train, y_test = train_test_split(clean_data, train_data.WnvPresent)

In [170]:
# try a gridsearch for random forest classifier
params = {'n_estimators': [5,10,25,100,200,500],
          'max_depth': [None, 3, 5, 10],
          'max_features': [0.25,0.5,0.75,1.0]}

gsrfc = GridSearchCV(RandomForestClassifier(),
                    params, n_jobs=-1,
                    cv=KFold(len(y_train), n_folds=3, shuffle=True),
                   scoring = 'accuracy')
gsrfc.fit(X_train, y_train)
accuracy = np.mean(cross_val_score(gsrfc.best_estimator_, X_test, y_test, cv = 10, scoring = 'accuracy'))

In [171]:
accuracy

0.9459448708781133

In [135]:
# fit the model
logreg = LogisticRegressionCV(cv = 5)
logreg.fit(train_X, y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

# Test Data

In [179]:
test = pd.read_csv('../datasets/test_clean.csv')
del test['Unnamed: 0']

In [180]:
test.head()

Unnamed: 0,Latitude,Longitude,PavgW,TmaxW,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS
0,41.95469,-87.800991,0.101429,82.428571,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,41.95469,-87.800991,0.101429,82.428571,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,41.95469,-87.800991,0.101429,82.428571,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,41.95469,-87.800991,0.101429,82.428571,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,41.95469,-87.800991,0.101429,82.428571,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [172]:
X = clean_data
y = train_data.WnvPresent

In [173]:
# try a gridsearch for random forest classifier
params = {'n_estimators': [5,10,25,100,200,500],
          'max_depth': [None, 3, 5, 10],
          'max_features': [0.25,0.5,0.75,1.0]}

gsrfc = GridSearchCV(RandomForestClassifier(),
                    params, n_jobs=-1,
                    cv=KFold(len(y_train), n_folds=3, shuffle=True),
                   scoring = 'accuracy')
gsrfc.fit(X, y)
best_rf = gsrfc.best_estimator_

In [181]:
predictions = gsrfc.predict(test)

test_data = test[['Trap','Date','Species','Latitude','Longitude','PavgW','TmaxW']]

test_dummies = pd.get_dummies(test_data, columns = ['Species'])

X = test_dummies[[u'Latitude', u'Longitude', u'PavgW', u'TmaxW',
    u'Species_CULEX ERRATICUS', u'Species_CULEX PIPIENS',
       u'Species_CULEX PIPIENS/RESTUANS', u'Species_CULEX RESTUANS',
       u'Species_CULEX SALINARIUS', u'Species_CULEX TARSALIS',
       u'Species_CULEX TERRITANS']]

In [102]:
#grouped_data = dummies.groupby(['Trap','Date','Latitude','Longitude','PavgW','TmaxW'], as_index = False).sum()

In [103]:
#for col in [u'Species_CULEX ERRATICUS',
       #u'Species_CULEX PIPIENS', u'Species_CULEX PIPIENS/RESTUANS',
       #u'Species_CULEX RESTUANS', u'Species_CULEX SALINARIUS',
       #u'Species_CULEX TARSALIS', u'Species_CULEX TERRITANS']:
    #grouped_data[col] = grouped_data[col].apply(lambda x: 0 if x == 0.0 else 1)

In [183]:
len(predictions)

116293

In [185]:
range(1,5)

[1, 2, 3, 4]

In [136]:
# make predictions with X
#predictions = logreg.predict(X)

In [186]:
submission = pd.DataFrame(columns = ['Id','WnvPresent'])
submission['Id'] = range(1, 116294)
submission['WnvPresent'] = predictions

In [187]:
submission.to_csv('../datasets/rfc_submission.csv')

In [None]:
len(X)