In [1]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages
Requirement already up-to-date: scikit-learn in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages (from imbalanced-learn)
Requirement already up-to-date: numpy in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages (from imbalanced-learn)
Requirement already up-to-date: scipy in /Users/veraburfield/anaconda3/envs/dsi/lib/python3.6/site-packages (from imbalanced-learn)
[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [58]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

In [59]:
train_data   = pd.read_csv('./train_final.csv')
test_data    = pd.read_csv('./test_final.csv')
spray_data   = pd.read_csv('./assets/spray.csv')

In [60]:
train_data.shape, test_data.shape

((10506, 32), (116293, 31))

In [61]:
def  date_separate (df):
    df = df.copy()
    df['Year'] = pd.DatetimeIndex(df['Date']).year
    df['Month'] = pd.DatetimeIndex(df['Date']).month
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

In [62]:
train_data  = date_separate(train_data)
test_data  = date_separate(test_data)

In [63]:
test_data['Species'].value_counts()

CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64

In [64]:
#adding to pipiens/restauns, so it can maybe learn from those, since those are the ones that carry wnv?
#so it makes it extra sensitive?

new_species = []
for i in test_data['Species']:
    if i == 'UNSPECIFIED CULEX':
        new_species.append('CULEX PIPIENS/RESTUANS')
    else:
        new_species.append(i)
test_data['Species'] = new_species        

In [65]:
test_data['Species'].value_counts()

CULEX PIPIENS/RESTUANS    29704
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
Name: Species, dtype: int64

In [66]:
train_data['Species'].value_counts()

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64

In [67]:
test_data.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'AvgSpeed', 'CodeSum', 'Daytime', 'Depart', 'DewPoint', 'PrecipTotal',
       'ResultDir', 'ResultSpeed', 'SeaLevel', 'StnPressure', 'Sunrise',
       'Sunrise_min', 'Sunset', 'Sunset_min', 'Tavg', 'Tmax', 'Tmin',
       'WetBulb', 'PrecipCat', 'NewCodeSum', 'Year', 'Month', 'Day'],
      dtype='object')

In [68]:
#turns out some addresses have missing zipcodes, maybe calculate based on lat/long?

import re
zipcodes = []
for i in train_data['Address']:
    reg = re.compile('\d{5}')
    zipcodes.append(reg.findall(i)) 
#train_data['Zipcode'] = zipcodes
#train_data['Zipcode'].astype(int)


zipcodes2 = []
for i in test_data['Address']:
    reg = re.compile('\d{5}')
    zipcodes2.append(reg.findall(i)) 
#test_data['Zipcode'] = zipcodes2
#train_data['Zipcode'].astype(int)
len(zipcodes), len(zipcodes2)

(10506, 116293)

In [69]:
train_data['Trap'] = [x.strip('TABCabc') for x in train_data['Trap']]
train_data['Trap'].astype(int)

test_data['Trap'] = [x.strip('TABCabc') for x in test_data['Trap']]
test_data['Trap'].astype(int)

0           2
1           2
2           2
3           2
4           2
5           2
6           2
7           2
8           7
9           7
10          7
11          7
12          7
13          7
14          7
15          7
16         15
17         15
18         15
19         15
20         15
21         15
22         15
23         15
24         45
25         45
26         45
27         45
28         45
29         45
         ... 
116263    238
116264    238
116265    238
116266    238
116267    238
116268    238
116269     65
116270     65
116271     65
116272     65
116273     65
116274     65
116275     65
116276     65
116277     94
116278     94
116279     94
116280     94
116281     94
116282     94
116283     94
116284     94
116285     54
116286     54
116287     54
116288     54
116289     54
116290     54
116291     54
116292     54
Name: Trap, Length: 116293, dtype: int64

In [70]:
#add zipcodes to dummies?

train_data.drop(columns=['AddressNumberAndStreet', 'Date',
                        'CodeSum',
                         'Street', 'Address'], axis=1, inplace=True)

train_data = pd.get_dummies(train_data, columns=['Trap','Species', 'NewCodeSum','PrecipCat'], drop_first=True)

test_data.drop(columns=['AddressNumberAndStreet','Date',
                        'CodeSum','Street','Address'], axis=1, inplace=True)

test_data = pd.get_dummies(test_data, columns=['Trap','Species','NewCodeSum','PrecipCat'], drop_first=True)

In [71]:
train_data.shape, test_data.shape

((10506, 168), (116293, 168))

In [72]:
#making sure index is in order
test_data = test_data.sort_values(by=['Id'])

In [73]:
y = train_data['WnvPresent']
X = train_data.drop(['WnvPresent'], axis=1)

In [74]:
[col for col in X.columns if col not in test_data.columns]

['NumMosquitos']

In [75]:
#Errr no number of mosquitoes in the test set?

In [76]:
[col for col in test_data.columns if col not in X.columns]

['Id', 'Trap_234']

In [77]:
all_cols = X.columns.union(test_data.columns)

X = X.assign(**{col:0 for col in all_cols.difference(X.columns).tolist()})
test_data = test_data.assign(**{col:0 for col in all_cols.difference(test_data.columns).tolist()})

In [78]:
test_data = test_data[X.columns]
assert (test_data.columns == X.columns).all().all()

In [79]:
train_data.shape, test_data.shape, X.shape

((10506, 168), (116293, 169), (10506, 169))

In [80]:
#Wait! I lost IDs. But maybe its ok since I made sure all rows are in the same order
test_data.head(1)

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,AvgSpeed,Daytime,Depart,DewPoint,PrecipTotal,...,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,NewCodeSum_1,PrecipCat_Precip,PrecipCat_Trace,Id,Trap_234
0,41,41.95469,-87.800991,9,0,10.0,910,7,56,0.0,...,1,0,0,0,0,1,0,0,1,0


In [81]:
#TrainTestSplit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [82]:
from sklearn.preprocessing import StandardScaler 
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
test_data = ss.transform(test_data)

In [83]:
sm = SMOTE()

X_train, y_train = sm.fit_sample(X_train, y_train)

### Logistic Regression

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
 

logreg = LogisticRegression()

params = {
}


gs_log = GridSearchCV(estimator = logreg, param_grid = params, scoring='roc_auc')
gs_log.fit(X_train, y_train)
print(gs_log.best_score_)
print(gs_log.best_params_)

print(gs_log.score(X_train,y_train))
print(gs_log.score(X_test,y_test))

0.916908308095004
{}
0.9209872773359778
0.8140135436500313


In [85]:
from sklearn.metrics import confusion_matrix

y_pred = gs_log.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 1999, fp: 490, fn: 45, tp: 93


In [86]:
len(gs_log.predict(test_data)), sum(gs_log.predict(test_data))

(116293, 6495)

In [87]:
predictions = gs_log.predict(test_data)

In [88]:
sample_submission = pd.read_csv('./assets/sampleSubmission.csv')
sample_submission['WnvPresent'] = predictions
sample_submission.to_csv('submission_logreg_new.csv', index=False)

### KNN

In [33]:
# knn takes forever

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier()

knn_c_pipe = Pipeline([ 
    ('knn',knn),
])
#knn_c_pipe.fit(X_train, y_train)
#knn_c_pipe.score(X_test, y_test)

In [35]:
#knn_c_pipe.get_params().keys()

### Decision Tree Classifier

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dt_c = DecisionTreeClassifier()


params = {
    'max_depth':[ 40, 70, 100, None],
    'max_features':[10, 30, 50, None]
}


gs_dt = GridSearchCV(estimator = dt_c, param_grid = params, scoring='roc_auc')
gs_dt.fit(X_train, y_train)
print(gs_dt.best_score_)
print(gs_dt.best_params_)

0.9423208957524398
{'max_depth': 40, 'max_features': 50}


In [37]:
dt_c.get_params().keys()

dict_keys(['class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [38]:
len(gs_dt.predict(test_data)), sum(gs_dt.predict(test_data))

(116293, 24872)

In [39]:
y_pred = gs_dt.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 2354, fp: 135, fn: 102, tp: 36


### Random Forest Classifier

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rf_c = RandomForestClassifier()


params = {
    'n_estimators': [10, 15, 20], 
    'max_features': [7, 10, 15, 20],
    'max_depth': [None, 4, 10]
}
gs_rf = GridSearchCV(rf_c, param_grid=params)
gs_rf.fit(X_train, y_train)
print(gs_rf.best_score_)
print(gs_rf.best_params_)

0.9571373899732109
{'max_depth': None, 'max_features': 7, 'n_estimators': 20}


In [54]:
rf_c.get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [55]:
len(gs_rf.predict(test_data)), sum(gs_rf.predict(test_data))

(116293, 8720)

In [56]:
y_pred = gs_rf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 2429, fp: 60, fn: 112, tp: 26


### AdaBoost

In [44]:
from sklearn.ensemble import AdaBoostClassifier
ada_r = AdaBoostClassifier()

params = {
    'n_estimators':range(1,7)
}
gs_ada = GridSearchCV(ada_r, param_grid = params)
gs_ada.fit(X_train, y_train)
print(gs_ada.best_score_)
print(gs_ada.best_params_)

0.8436280137772675
{'n_estimators': 4}


In [45]:
gs_ada.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__algorithm', 'estimator__base_estimator', 'estimator__learning_rate', 'estimator__n_estimators', 'estimator__random_state', 'estimator', 'fit_params', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [46]:
len(gs_ada.predict(test_data)), sum(gs_ada.predict(test_data))

(116293, 456)

In [47]:
y_pred = gs_ada.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

tn: 2126, fp: 363, fn: 61, tp: 77


### Support Vector Classifier

In [48]:
from sklearn.svm import SVC
clf_c = SVC(kernel='linear')

clf_c_pipe = Pipeline([ 
    ('clf',clf_c), 
])

#clf_c_pipe.fit(X_train, y_train)
#print(clf_c_pipe.score(X_test, y_test))

In [49]:
#clf_c_pipe.get_params().keys()

In [50]:
#len(clf_c_pipe.predict(test_data)), sum(clf_c_pipe.predict(test_data))

In [51]:
y_pred = clf_c_pipe.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

### CNN

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt

In [None]:
#ss = StandardScaler()
#X_train = ss.fit_transform(X_train)
#X_test = ss.transform(X_test)

In [None]:
from keras import regularizers
import numpy as np
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

np.random.seed(42)

model_4 = Sequential()

n_input = X_train.shape[1]
n_hidden = n_input
n_output = 1

model_4.add(Dense(n_hidden, input_dim=n_input, activation='relu'))
model_4.add(Dense(n_output, activation='sigmoid'))

model_4.compile(loss='binary_crossentropy', optimizer='adam', 
                metrics=['acc'])

#early_stop = EarlyStopping(monitor='val_loss', min_delta=0) 


history = model_4.fit(X_train, y_train, validation_data=(X_test, y_test),
                      epochs=30, batch_size=None)
                     # callbacks=[early_stop])

In [None]:
train_loss_2 = history.history['loss']
test_loss_2 = history.history['val_loss']
plt.plot(train_loss_2, label='Training loss')
plt.plot(test_loss_2, label='Testing loss')
plt.legend()

In [None]:
len(model_4.predict(test_data)), sum(model_4.predict(test_data))

### Evaluate

In [None]:
from sklearn.metrics import confusion_matrix

def evaluate (X_test, y_test, model):
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    return("tn: {}, fp: {}, fn: {}, tp: {}".format (tn, fp, fn, tp))

In [None]:
models = [logreg, gs_dt, gs_rf, gs_ada, clf_c_pipe]
for model in models:
    print (evaluate (X_test, y_test, model))