In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', 50)


Reading in train and test datasets, resetting the date as index

In [3]:
df = pd.read_csv('train_weather_per_station.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace = True)
df.head()

Unnamed: 0_level_0,Num_Duplicates,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,...,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,1,0,2007,5,29,0,0,1,0,0,...,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,0,1,0,...,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,0,1,0,...,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,1,0,0,...,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,1,0,2007,5,29,0,0,0,1,0,...,74,10,58,65,9,421,1917,0.0,5.8,18


In [4]:
test = pd.read_csv('test_weather_per_station.csv')
test['Date'] = pd.to_datetime(test['Date'])
test.set_index('Date', inplace = True)
test.head()

Unnamed: 0_level_0,Num_Duplicates,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,...,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-06-11,1,2008,6,11,0,0,1,0,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,0,1,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,1,0,0,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,0,0,1,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,0,0,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18


In [5]:
print(f'Number of columns in the train set: {df.shape[1]}')
print(f'Number of columns in the test set: {test.shape[1]}')

Number of columns in the train set: 24
Number of columns in the test set: 24


Since there is a column of Unspecified Species in the test data that is not in the train data, I substitute that species in the test data with the most popular species.

In [6]:
# checking which is the most popular species 

test[['CULEX ERRATICUS','CULEX PIPIENS','CULEX PIPIENS/RESTUANS','CULEX RESTUANS','CULEX SALINARIUS',
     'CULEX TARSALIS','CULEX TERRITANS']].sum()

CULEX ERRATICUS           14345
CULEX PIPIENS             14521
CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX SALINARIUS          14355
CULEX TARSALIS            14347
CULEX TERRITANS           14351
dtype: int64

In [7]:
# assigning culex pipiens/restuans to the unspecified culex
for index, row in test.iterrows():
    if row['UNSPECIFIED CULEX'] == 1:
        test.loc[index,'CULEX PIPIENS/RESTUANS'] = 1

# check if the number of culex pipiens/restuans has actually increased
test['CULEX PIPIENS/RESTUANS'].sum()

116293

In [8]:
# droppig the unspecified culex
test.drop('UNSPECIFIED CULEX', axis = 1, inplace = True)
test.head()

Unnamed: 0_level_0,Num_Duplicates,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,...,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-06-11,1,2008,6,11,0,0,1,0,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,1,1,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,1,1,0,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,1,0,1,0,...,74,7,56,64,9,416,1926,0.0,8.9,18
2008-06-11,1,2008,6,11,0,0,1,0,0,0,...,74,7,56,64,9,416,1926,0.0,8.9,18


In [11]:
features = [feat for feat in df.columns if feat != "WnvPresent"]

X = df[features]
y = df['WnvPresent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 17)

# KNN model

In [12]:
k = np.arange(3,30,2)
parameters = {'n_neighbors': k,
              'p': [1,2] }
knn = KNeighborsClassifier()
clf = GridSearchCV(knn,parameters,cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]), 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [13]:
clf.best_params_

{'n_neighbors': 5, 'p': 1}

In [14]:
y_pred = clf.predict(X_test)
auc_knn = roc_auc_score(y_test, y_pred)
auc_knn

0.802118754312734

# Logistic Regression model

In [17]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
auc_logreg = roc_auc_score(y_test, y_pred)
auc_logreg

0.6684732073166594

# Random Forest w/ Grid Search

In [19]:
params = {'n_estimators': [10,15,20],
          'max_depth': [40,50,60]}
rfc = RandomForestClassifier() 
rfcl = GridSearchCV(rfc,params,cv=10)
rfcl.fit(X_train, y_train)
rfcl.best_params_

{'max_depth': 50, 'n_estimators': 20}

In [20]:
y_pred = rfcl.predict(X_test)
auc_rfcl = roc_auc_score(y_test, y_pred)
auc_rfcl

0.8203719414938744