In [1]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages
Requirement already up-to-date: scikit-learn in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages (from imbalanced-learn)
Collecting numpy (from imbalanced-learn)
  Downloading https://files.pythonhosted.org/packages/f6/cd/b2c50b5190b66c711c23ef23c41d450297eb5a54d2033f8dcb3b8b13ac85/numpy-1.14.5-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (4.7MB)
[K    100% |████████████████████████████████| 4.7MB 292kB/s ta 0:00:011
[?25hRequirement already up-to-date: scipy in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages (from imbalanced-learn)
Installing collected packages: numpy
  Found existing installation: numpy 1.14.4
    Uninstalling numpy-1.14.4:
      Successfully uninstalled numpy-1.14.4
Successfully installed numpy-1.14.5
[33mYou are using pip version 9.0.3, however v

In [83]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

In [84]:
train_data   = pd.read_csv('./train_final.csv')
test_data    = pd.read_csv('./test_final.csv')
spray_data   = pd.read_csv('./assets/spray.csv')

In [85]:
train_data.shape, test_data.shape

((10506, 32), (116293, 31))

In [86]:
def  date_separate (df):
    df = df.copy()
    df['Year'] = pd.DatetimeIndex(df['Date']).year
    df['Month'] = pd.DatetimeIndex(df['Date']).month
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

In [87]:
train_data  = date_separate(train_data)
test_data  = date_separate(test_data)

In [88]:
test_data['Species'].value_counts()

CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64

In [89]:
new_species = []
contag = ['CULEX SALINARIUS','CULEX TERRITANS','CULEX ERRATICUS']
for i in test_data['Species']:
    if i in contag:
        new_species.append(0)
    else:
        new_species.append(1)
test_data['Species'] = new_species  

In [90]:
new_species = []
contag = ['CULEX SALINARIUS','CULEX TERRITANS','CULEX ERRATICUS']
for i in train_data['Species']:
    if i in contag:
        new_species.append(0)
    else:
        new_species.append(1)
train_data['Species'] = new_species  

In [91]:
test_data['Species'].value_counts()

1    58572
0    57721
Name: Species, dtype: int64

In [92]:
train_data['Species'].value_counts()

1    7457
0    3049
Name: Species, dtype: int64

In [93]:
test_data.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'AvgSpeed', 'CodeSum', 'Daytime', 'Depart', 'DewPoint', 'PrecipTotal',
       'ResultDir', 'ResultSpeed', 'SeaLevel', 'StnPressure', 'Sunrise',
       'Sunrise_min', 'Sunset', 'Sunset_min', 'Tavg', 'Tmax', 'Tmin',
       'WetBulb', 'PrecipCat', 'NewCodeSum', 'Year', 'Month', 'Day'],
      dtype='object')

In [94]:
#turns out some addresses have missing zipcodes, maybe calculate based on lat/long?

import re
zipcodes = []
for i in train_data['Address']:
    reg = re.compile('\d{5}')
    zipcodes.append(reg.findall(i)) 
#train_data['Zipcode'] = zipcodes
#train_data['Zipcode'].astype(int)


zipcodes2 = []
for i in test_data['Address']:
    reg = re.compile('\d{5}')
    zipcodes2.append(reg.findall(i)) 
#test_data['Zipcode'] = zipcodes2
#train_data['Zipcode'].astype(int)
len(zipcodes), len(zipcodes2)

(10506, 116293)

In [95]:
train_data['Trap'] = [x.strip('TABCabc') for x in train_data['Trap']]
train_data['Trap'].astype(int)

test_data['Trap'] = [x.strip('TABCabc') for x in test_data['Trap']]
test_data['Trap'].astype(int)

0           2
1           2
2           2
3           2
4           2
5           2
6           2
7           2
8           7
9           7
10          7
11          7
12          7
13          7
14          7
15          7
16         15
17         15
18         15
19         15
20         15
21         15
22         15
23         15
24         45
25         45
26         45
27         45
28         45
29         45
         ... 
116263    238
116264    238
116265    238
116266    238
116267    238
116268    238
116269     65
116270     65
116271     65
116272     65
116273     65
116274     65
116275     65
116276     65
116277     94
116278     94
116279     94
116280     94
116281     94
116282     94
116283     94
116284     94
116285     54
116286     54
116287     54
116288     54
116289     54
116290     54
116291     54
116292     54
Name: Trap, Length: 116293, dtype: int64

In [96]:
#add zipcodes to dummies?

train_data.drop(columns=['AddressNumberAndStreet', 'Date',
                        'CodeSum',
                         'Street', 'Address'], axis=1, inplace=True)

train_data = pd.get_dummies(train_data, columns=['Trap','Species', 'NewCodeSum','PrecipCat'], drop_first=True)

test_data.drop(columns=['AddressNumberAndStreet','Date',
                        'CodeSum','Street','Address'], axis=1, inplace=True)

test_data = pd.get_dummies(test_data, columns=['Trap','Species','NewCodeSum','PrecipCat'], drop_first=True)

In [97]:
train_data.shape, test_data.shape

((10506, 163), (116293, 163))

In [98]:
#making sure index is in order
test_data = test_data.sort_values(by=['Id'])

In [99]:
y = train_data['WnvPresent']
X = train_data.drop(['WnvPresent'], axis=1)

In [100]:
[col for col in X.columns if col not in test_data.columns]

['NumMosquitos']

In [101]:
#Errr no number of mosquitoes in the test set?

In [102]:
[col for col in test_data.columns if col not in X.columns]

['Id', 'Trap_234']

In [103]:
all_cols = X.columns.union(test_data.columns)

X = X.assign(**{col:0 for col in all_cols.difference(X.columns).tolist()})
test_data = test_data.assign(**{col:0 for col in all_cols.difference(test_data.columns).tolist()})

In [104]:
test_data = test_data[X.columns]
assert (test_data.columns == X.columns).all().all()

In [105]:
train_data.shape, test_data.shape, X.shape

((10506, 163), (116293, 164), (10506, 164))

In [106]:
#Wait! I lost IDs. But maybe its ok since I made sure all rows are in the same order
test_data.head(1)

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,AvgSpeed,Daytime,Depart,DewPoint,PrecipTotal,...,Trap_237,Trap_238,Trap_900,Trap_903,Species_1,NewCodeSum_1,PrecipCat_Precip,PrecipCat_Trace,Id,Trap_234
0,41,41.95469,-87.800991,9,0,10.0,910,7,56,0.0,...,0,0,0,0,1,1,0,0,1,0


In [107]:
#TrainTestSplit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [108]:
from sklearn.preprocessing import StandardScaler 
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
test_data = ss.transform(test_data)

In [109]:
sm = SMOTE()

X_train, y_train = sm.fit_sample(X_train, y_train)

### Logistic Regression

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
 

logreg = LogisticRegression()

params = {
}


gs_log = GridSearchCV(estimator = logreg, param_grid = params, scoring='roc_auc')
gs_log.fit(X_train, y_train)
print(gs_log.best_score_)
print(gs_log.best_params_)

print(gs_log.score(X_train,y_train))
print(gs_log.score(X_test,y_test))

0.9118456021860675
{}
0.9164519569705372
0.8116189494646007


In [111]:
from sklearn.metrics import confusion_matrix

y_pred = gs_log.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 1983, fp: 506, fn: 44, tp: 94


In [112]:
len(gs_log.predict(test_data)), sum(gs_log.predict(test_data))

(116293, 9656)

In [113]:
predictions = gs_log.predict(test_data)

In [114]:
sample_submission = pd.read_csv('./assets/sampleSubmission.csv')
sample_submission['WnvPresent'] = predictions
sample_submission.to_csv('submission_logreg_new3.csv', index=False)

### KNN

In [34]:
# knn takes forever

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier()

knn_c_pipe = Pipeline([ 
    ('knn',knn),
])
#knn_c_pipe.fit(X_train, y_train)
#knn_c_pipe.score(X_test, y_test)

In [36]:
#knn_c_pipe.get_params().keys()

### Decision Tree Classifier

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dt_c = DecisionTreeClassifier()


params = {
    'max_depth':[ 40, 70, 100, None],
    'max_features':[10, 30, 50, None]
}


gs_dt = GridSearchCV(estimator = dt_c, param_grid = params, scoring='roc_auc')
gs_dt.fit(X_train, y_train)
print(gs_dt.best_score_)
print(gs_dt.best_params_)

0.9546193667875416
{'max_depth': 100, 'max_features': 10}


In [38]:
dt_c.get_params().keys()

dict_keys(['class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [39]:
len(gs_dt.predict(test_data)), sum(gs_dt.predict(test_data))

(116293, 41151)

In [40]:
y_pred = gs_dt.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 2360, fp: 129, fn: 98, tp: 40


### Random Forest Classifier

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rf_c = RandomForestClassifier()


params = {
    'n_estimators': [10, 15, 20], 
    'max_features': [7, 10, 15, 20],
    'max_depth': [None, 4, 10]
}
gs_rf = GridSearchCV(rf_c, param_grid=params)
gs_rf.fit(X_train, y_train)
print(gs_rf.best_score_)
print(gs_rf.best_params_)

0.9588802571658184
{'max_depth': None, 'max_features': 7, 'n_estimators': 15}


In [42]:
rf_c.get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [43]:
len(gs_rf.predict(test_data)), sum(gs_rf.predict(test_data))

(116293, 20694)

In [44]:
y_pred = gs_rf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 2407, fp: 82, fn: 102, tp: 36


### AdaBoost

In [45]:
from sklearn.ensemble import AdaBoostClassifier
ada_r = AdaBoostClassifier()

params = {
    'n_estimators':range(1,7)
}
gs_ada = GridSearchCV(ada_r, param_grid = params)
gs_ada.fit(X_train, y_train)
print(gs_ada.best_score_)
print(gs_ada.best_params_)

0.8516608625770158
{'n_estimators': 6}


In [46]:
gs_ada.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__algorithm', 'estimator__base_estimator', 'estimator__learning_rate', 'estimator__n_estimators', 'estimator__random_state', 'estimator', 'fit_params', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [47]:
len(gs_ada.predict(test_data)), sum(gs_ada.predict(test_data))

(116293, 405)

In [48]:
y_pred = gs_ada.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 2083, fp: 406, fn: 44, tp: 94


### Support Vector Classifier

In [49]:
from sklearn.svm import SVC
clf_c = SVC(kernel='linear')

clf_c_pipe = Pipeline([ 
    ('clf',clf_c), 
])

#clf_c_pipe.fit(X_train, y_train)
#print(clf_c_pipe.score(X_test, y_test))

In [50]:
#clf_c_pipe.get_params().keys()

In [51]:
#len(clf_c_pipe.predict(test_data)), sum(clf_c_pipe.predict(test_data))

In [52]:
y_pred = clf_c_pipe.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

### CNN

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt

In [None]:
#ss = StandardScaler()
#X_train = ss.fit_transform(X_train)
#X_test = ss.transform(X_test)

In [None]:
from keras import regularizers
import numpy as np
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

np.random.seed(42)

model_4 = Sequential()

n_input = X_train.shape[1]
n_hidden = n_input
n_output = 1

model_4.add(Dense(n_hidden, input_dim=n_input, activation='relu'))
model_4.add(Dense(n_output, activation='sigmoid'))

model_4.compile(loss='binary_crossentropy', optimizer='adam', 
                metrics=['acc'])

#early_stop = EarlyStopping(monitor='val_loss', min_delta=0) 


history = model_4.fit(X_train, y_train, validation_data=(X_test, y_test),
                      epochs=30, batch_size=None)
                     # callbacks=[early_stop])

In [None]:
train_loss_2 = history.history['loss']
test_loss_2 = history.history['val_loss']
plt.plot(train_loss_2, label='Training loss')
plt.plot(test_loss_2, label='Testing loss')
plt.legend()

In [None]:
len(model_4.predict(test_data)), sum(model_4.predict(test_data))

### Evaluate

In [None]:
from sklearn.metrics import confusion_matrix

def evaluate (X_test, y_test, model):
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    return("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

In [None]:
models = [logreg, gs_dt, gs_rf, gs_ada, clf_c_pipe]
for model in models:
    print (evaluate (X_test, y_test, model))