In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler
from imblearn import under_sampling, over_sampling
from operator import itemgetter

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

%matplotlib inline

### Train Data Import

In [2]:
trainDf = pd.read_csv('trainSprayCombinedWeather.csv')

In [3]:
testDf = pd.read_csv('testDf.csv')

In [4]:
trainDf.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Most_Recent_Spray,Recently_Sprayed,...,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Weather,Day_of_Month,Month,Year,Day_of_Week
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.955,-87.801,9,1,0,3650.0,0,...,29.39,30.11,5.8,18,6.5,coldAndWet,29,5,2007,1
1,2007-05-29,CULEX RESTUANS,T002,41.955,-87.801,9,1,0,3650.0,0,...,29.39,30.11,5.8,18,6.5,coldAndWet,29,5,2007,1
2,2007-05-29,CULEX RESTUANS,T007,41.995,-87.769,9,1,0,3650.0,0,...,29.39,30.11,5.8,18,6.5,coldAndWet,29,5,2007,1
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974,-87.825,8,1,0,3650.0,0,...,29.39,30.11,5.8,18,6.5,coldAndWet,29,5,2007,1
4,2007-05-29,CULEX RESTUANS,T015,41.974,-87.825,8,4,0,3650.0,0,...,29.39,30.11,5.8,18,6.5,coldAndWet,29,5,2007,1


In [5]:
trainDf.columns

Index(['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Most_Recent_Spray', 'Recently_Sprayed',
       'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat', 'Cool',
       'CodeSum', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed', 'Weather', 'Day_of_Month', 'Month', 'Year',
       'Day_of_Week'],
      dtype='object')

In [6]:
trainDf.shape

(10506, 29)

In [7]:
trainDf.drop(columns=['Date', 'Species', 'Trap', 'AddressAccuracy', 'CodeSum', 'Day_of_Month','Month', 'Year',
       'Day_of_Week'], axis =1, inplace =True)

In [8]:
trainDf.shape

(10506, 20)

In [9]:
trainDf.columns

Index(['Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent',
       'Most_Recent_Spray', 'Recently_Sprayed', 'Tmax', 'Tmin', 'Tavg',
       'DewPoint', 'WetBulb', 'Heat', 'Cool', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Weather'],
      dtype='object')

In [10]:
testDf.Weather.unique()

array(['normal', 'coldAndWet', 'hotAndDry'], dtype=object)

In [11]:
new_values = {'normal': 0, 'coldAndWet': 1, 'hotAndDry': 2}
trainDf.Weather = trainDf.Weather.map(new_values)

In [12]:
testDf.Weather.unique()

array(['normal', 'coldAndWet', 'hotAndDry'], dtype=object)

### Test Data Import

In [13]:
testDf = pd.read_csv('testDf.csv')

In [14]:
testDf.head()

Unnamed: 0,Id,Date,Species,Latitude,Longitude,AddressAccuracy,Most_Recent_Spray,Recently_Sprayed,Tmax,Tmin,...,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Weather,Day_of_Month,Month,Year,Day_of_Week
0,1,2008-06-11,CULEX PIPIENS/RESTUANS,41.955,-87.801,9,3650.0,0,86,61,...,29.28,29.99,8.9,18,10.0,normal,11,6,2008,2
1,2,2008-06-11,CULEX RESTUANS,41.955,-87.801,9,3650.0,0,86,61,...,29.28,29.99,8.9,18,10.0,normal,11,6,2008,2
2,3,2008-06-11,CULEX PIPIENS,41.955,-87.801,9,3650.0,0,86,61,...,29.28,29.99,8.9,18,10.0,normal,11,6,2008,2
3,4,2008-06-11,CULEX SALINARIUS,41.955,-87.801,9,3650.0,0,86,61,...,29.28,29.99,8.9,18,10.0,normal,11,6,2008,2
4,5,2008-06-11,CULEX TERRITANS,41.955,-87.801,9,3650.0,0,86,61,...,29.28,29.99,8.9,18,10.0,normal,11,6,2008,2


In [15]:
testDf.columns

Index(['Id', 'Date', 'Species', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Most_Recent_Spray', 'Recently_Sprayed', 'Tmax', 'Tmin', 'Tavg',
       'DewPoint', 'WetBulb', 'Heat', 'Cool', 'CodeSum', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'Weather', 'Day_of_Month', 'Month', 'Year', 'Day_of_Week'],
      dtype='object')

In [16]:
testDf.shape

(116293, 27)

In [17]:
testDf.drop(columns=['Id','Date', 'Species', 'AddressAccuracy', 'CodeSum', 'Day_of_Month','Month', 'Year',
       'Day_of_Week'], axis =1, inplace =True)

In [18]:
testDf.shape

(116293, 18)

In [19]:
testDf.Weather.unique()

array(['normal', 'coldAndWet', 'hotAndDry'], dtype=object)

In [20]:
testDf.Weather = testDf.Weather.map(new_values)

In [21]:
testDf.Weather.unique()

array([0, 1, 2])

In [22]:
testDf.columns

Index(['Latitude', 'Longitude', 'Most_Recent_Spray', 'Recently_Sprayed',
       'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat', 'Cool',
       'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Weather'],
      dtype='object')

### Preprocessing & Modeling

In [23]:
#assign target and result to y and X for training set then carry out train test split
X = trainDf.drop(columns=['WnvPresent', "NumMosquitos"])
y = trainDf.WnvPresent

In [24]:
#assign target and result to y and X for test set
X_testset = testDf
y = trainDf.WnvPresent

In [25]:
# Import train_test_split.
from sklearn.model_selection import train_test_split

# Create train_test_split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

#### Baseline Accuracy 

In [26]:
y.value_counts(normalize=True)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

#### Train/Test Split

In [27]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

#### Balance Classes: Oversampling (SMOTE)

In [28]:
# Oversampling on training data only
X_res, y_res = SMOTE().fit_resample(X_train, y_train)

In [29]:
print(X_res.shape)
print(y_res.shape)

(13936, 18)
(13936,)


#### Test/Train Split

In [30]:
# Create train_test_split after oversampling
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_res, y_res, test_size=0.3, random_state=42, stratify=y_res)

In [31]:
print(X_train_2.shape)
print(y_train_2.shape)
print(X_test_2.shape)
print(y_test_2.shape)

(9755, 18)
(9755,)
(4181, 18)
(4181,)


#### Random Forest

In [32]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_2, y_train_2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [33]:
cross_val_score(rf, X_train_2, y_train_2, cv=5).mean()

0.9152229625832906

In [34]:
y_preds = rf.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

              precision    recall  f1-score   support

           0       0.96      0.91      0.94      2987
           1       0.18      0.36      0.24       165

    accuracy                           0.88      3152
   macro avg       0.57      0.64      0.59      3152
weighted avg       0.92      0.88      0.90      3152



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,2715,272
Actual Positive,105,60


In [35]:
zipped = list(zip(X.columns.values, rf.feature_importances_))
sorted_importance = sorted(zipped, key=itemgetter(1), reverse=True)
sorted_importance

[('Longitude', 0.19206615584972606),
 ('Latitude', 0.1860838909532382),
 ('ResultSpeed', 0.0654640760874245),
 ('SeaLevel', 0.060003048317635796),
 ('Tmax', 0.05833964131333272),
 ('Cool', 0.055398324937391975),
 ('WetBulb', 0.0545353775277728),
 ('AvgSpeed', 0.05107575638954828),
 ('Tavg', 0.04909080946340867),
 ('DewPoint', 0.04875006524323967),
 ('ResultDir', 0.048374171225699786),
 ('Tmin', 0.04354017899811171),
 ('StnPressure', 0.033606988259165325),
 ('PrecipTotal', 0.022199398193408194),
 ('Weather', 0.015041098638262947),
 ('Heat', 0.006573401835111475),
 ('Most_Recent_Spray', 0.006181699825868504),
 ('Recently_Sprayed', 0.0036759169416533026)]

In [36]:
pd.DataFrame(sorted_importance[0:5], columns=['Feature', 'Importance'])

Unnamed: 0,Feature,Importance
0,Longitude,0.192066
1,Latitude,0.186084
2,ResultSpeed,0.065464
3,SeaLevel,0.060003
4,Tmax,0.05834


In [37]:
proba_pairs = rf.predict_proba(X_test)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

0.751429933753335

#### Random Forest with GridSearch 

In [42]:
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'min_samples_split':[2, 5 ,10]
}

rf_gs = GridSearchCV(rf, param_grid=rf_params, cv=5, verbose = 1,n_jobs = -1)
rf_gs.fit(X_train_2, y_train_2)
print(rf_gs.best_score_)
rf_gs.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:   48.6s finished


0.915325474115838


{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

In [43]:
y_preds = rf_gs.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

              precision    recall  f1-score   support

           0       0.96      0.91      0.94      2987
           1       0.18      0.36      0.24       165

    accuracy                           0.88      3152
   macro avg       0.57      0.63      0.59      3152
weighted avg       0.92      0.88      0.90      3152



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,2716,271
Actual Positive,106,59


In [44]:
proba_pairs = rf_gs.predict_proba(X_test)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

0.753722697345061

#### Random Forest with RandomizedSearch

In [46]:
rf_rand = RandomizedSearchCV(RandomForestClassifier(), rf_params, cv=5, n_iter=50, random_state=42, n_jobs = -1, verbose = 1)
rf_rand.fit(X_train_2, y_train_2)
print(rf_rand.best_score_)
rf_rand.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   47.0s finished


0.9157355202460277


{'n_estimators': 100, 'min_samples_split': 5, 'max_depth': None}

In [47]:
y_preds = rf_rand.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      2987
           1       0.17      0.37      0.24       165

    accuracy                           0.87      3152
   macro avg       0.57      0.64      0.58      3152
weighted avg       0.92      0.87      0.89      3152



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,2694,293
Actual Positive,104,61


In [48]:
proba_pairs = rf_rand.predict_proba(X_test)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

0.7735003195666068

#### Comparing ROC AUC Scores

1. Random Forest is 0.7488734008988445
2. Random Forest with GridSearch is 0.7686084142394822
3. Random Forest with RandomizedSearch is 0.749702245082225

Since Random Forest with GridSearch has the highest ROC AUC Score, we will be moving forth with it.

In [49]:
roc_auc = auc(fpr, tpr)

fix, ax = plt.subplots(figsize=(14,10))

plt.title('Receiver Operating Characteristic', fontsize=20)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right', fontsize=14)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=16)
plt.show()

NameError: name 'fpr' is not defined

### Kaggle Submission

In [50]:
X_testset = ss.transform(X_testset)

In [51]:
proba_pairs = rf_rand.predict_proba(X_testset)
probas = [item[1] for item in proba_pairs]

In [52]:
submission = pd.DataFrame(probas)

In [53]:
submission.index += 1

In [54]:
submission.head()

Unnamed: 0,0
1,0.377
2,0.377
3,0.377
4,0.377
5,0.377


In [55]:
submission.rename(columns={0: "WnvPresent"}, inplace=True)

In [56]:
submission.head()

Unnamed: 0,WnvPresent
1,0.377
2,0.377
3,0.377
4,0.377
5,0.377


In [57]:
submission.reset_index(level=0, inplace=True)
submission.rename(columns={"index": "ID"}, inplace=True)
submission.head()

Unnamed: 0,ID,WnvPresent
0,1,0.377
1,2,0.377
2,3,0.377
3,4,0.377
4,5,0.377


In [58]:
submission.to_csv('./kaggle_submission.csv', index=False)

We've obtained a Kaggle Submission Score of 0.54552

Possible ways of improving the model will be including features for cumulative weather conditions:

1. Average temperature over the past week
2. Average precipitation over the past week 
3. Number of days without preciptiation.