### Model Building Starter

Below are all the relevant dataframes

Only thing that remains to be done is aggregate for when there are multiple observations for a single trap/day/species  


`df_w` include all the aggregated weather  
`df_a` includes weather and park/water data (note park/water data are svd features  
`df_s` includes all the spray data (so only years 2011/13) with categorically encoded "spray before" and "spray after" Variables  


No train/test splits have been done, but what we discussed was as standard train/test split for the spray data, and 4-fold CV for the other data, split by year (there is a 'yr' column in the dfs)

In [7]:
import pandas as pd
import pickle

with open('allDF.pickle','rb') as f:
    allDF = pickle.load(f)
    
df_w = allDF[0]
df_a = allDF[1]
df_s = allDF[2]

In [8]:
df = df_a

In [9]:
df.columns

Index(['NumMosquitos', 'WnvPresent', 'Date', 'Species', 'Trap', 'Latitude',
       'Longitude', 'Yr', 'Mo', 'Week', 'Location', 'CULEX PIPIENS',
       'CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX SALINARIUS',
       'CULEX TERRITANS', 'CULEX TARSALIS', 'CULEX ERRATICUS', 'DewPointamax',
       'DewPointamin', 'DewPointmean', 'StnPressureamax', 'StnPressureamin',
       'StnPressuremean', 'AvgSpeedamax', 'AvgSpeedamin', 'AvgSpeedmean',
       'temp_max', 'temp_min', 'temp_avg', 'temp_expected', 'temp_diff',
       'sunset', 'precip_total', 'precip_avg', 'Park0', 'Park1', 'Park2',
       'Park3', 'Park4', 'Park5', 'Water0', 'Water1', 'Water2', 'Water3',
       'Water4', 'Water5'],
      dtype='object')

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
ss = StandardScaler()

In [13]:
test = df[df['Yr']==2009]
train = df[df['Yr']!= 2009]

y_train = train['WnvPresent']
y_test = test['WnvPresent']

toDrop = ['Date','WnvPresent','Location','NumMosquitos', 'Species','Trap','Location','Yr',
          #'Date_end',
          #'DewPointamax', 'DewPointamin','temp_max', 'temp_min',
          #'Park0', 'Park1', 'Park2','Park3', 'Park4', 'Park5', 
          #'Water0', 'Water1', 'Water2', 'Water3','Water4', 'Water5'
         ]
X_train0 = train.drop(toDrop, axis = 'columns')
#X_train = X_train.iloc[:,4:] # dropping species, trap, date
X_test0 = test.drop(toDrop, axis = 'columns')
#X_test = X_test.iloc[:,4:] # dropping species, trap, date

In [14]:
X_train = ss.fit_transform(X_train0)
X_test = ss.transform(X_test0)

In [98]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold


def best_model(model_list, X_train, y_train):
    """
    Runs a randomized search of 10 iters, and strat-kfold cv
    And produces a list of classifiers - parameters - scores
    for the best of each type of model
    
    To change it to ONLY spit out the best model, replace 
    
    'return classifier' 
    
    for
    
    'return best_classifier[1]'
    """
    best_score = 0.0
    best_classifier = None
    classifiers = []
    for name, model, parameters in model_list:
        classifiers.append(best_config(name, model, parameters,
                                       X_train,
                                       y_train))
 
    for name, score, classifier in classifiers:
        if (score > best_score):
            best_score = score
            best_classifier = [name, classifier]
    return classifiers

def best_config(name, model, parameters, X_train, y_train):
    """
    A simple GridSearch 
    Requires: model, parameters, X & y variables
    +
    """
    k = StratifiedKFold(n_splits=5)
    print("Searching "+str(name))
    clf = RandomizedSearchCV(model, parameters[0], cv=k, 
                            n_iter=15, verbose=1, 
                             n_jobs=2)
    clf.fit(X_train, y_train)
    print("Finished "+str(name))
    print('')
    return [str(clf.best_params_), 
            clf.best_score_, 
            clf.best_estimator_]

def model_list():
    
    """
    Generates a list of models & params,
    Add more manually, for more options
    gb = GradientBoostingClassifier()
    rf = RandomForestClassifier()
    bg = BaggingClassifier()
    ad = AdaBoostClassifier()
    knn = KNeighborsClassifier()
    """
    
    models = []    

    #gradient
    gb_tuned_parameters = [{"n_estimators": [50, 75, 100, 150, 200, 250],
                           'learning_rate' : [0.05, 0.1, 0.2], 
                            'min_samples_split' : [2, 3, 4], 
                            'max_depth' : [2, 3, 4, 5]}]
    models.append(['GB', GradientBoostingClassifier(), gb_tuned_parameters])
 
    #random forest
    rf_tuned_parameters = [{"max_depth": [None, 2, 3, 4, 5],
                           'criterion' : ['gini'], 
                            'min_samples_split' : [2, 3, 4], 
                            'min_samples_leaf' : [1, 2, 3]}]
    models.append(["RandomForest",RandomForestClassifier(n_jobs=-1),rf_tuned_parameters])

    #Bagged
    bg_tuned_parameters = [{"n_estimators": [x for x in range(2,20)]}]
    models.append(['BG', BaggingClassifier(), bg_tuned_parameters])

    #Ada Model
    ad_tuned_parameters = [{"n_estimators": [x for x in range(2,20)]}]
    models.append(['AD', AdaBoostClassifier(), ad_tuned_parameters])
    
    #kNN Model
    knn_tuned_parameters = [{"n_neighbors": [1, 3, 5, 10, 20], 
                             'weights':['uniform','distance'],
                            'metric':['euclidean','manhattan']}]
    models.append(["kNN", KNeighborsClassifier(),knn_tuned_parameters])
    return models

In [99]:
GS_model = best_model(model_list(),X_train,y_train)

Searching GB
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   48.7s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:  1.4min finished


Finished GB

Searching RandomForest
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    8.3s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   13.4s finished


Finished RandomForest

Searching BG
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    8.9s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   13.2s finished


Finished BG

Searching AD
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  72 out of  75 | elapsed:    4.9s remaining:    0.2s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:    5.0s finished


Finished AD

Searching kNN
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   35.7s


Finished kNN



[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   56.3s finished


In [100]:
GS_model

[["{'n_estimators': 200, 'min_samples_split': 3, 'max_depth': 2, 'learning_rate': 0.05}",
  0.8914634146341464,
  GradientBoostingClassifier(criterion='friedman_mse', init=None,
                learning_rate=0.05, loss='deviance', max_depth=2,
                max_features=None, max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=3,
                min_weight_fraction_leaf=0.0, n_estimators=200,
                presort='auto', random_state=None, subsample=1.0, verbose=0,
                warm_start=False)],
 ["{'min_samples_split': 4, 'min_samples_leaf': 2, 'max_depth': 3, 'criterion': 'gini'}",
  0.9332317073170732,
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=3, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=4,
              min_

In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
gb = GradientBoostingClassifier()
rf = RandomForestClassifier()
bg = BaggingClassifier()
ad = AdaBoostClassifier()

rf.fit(X_train, y_train)
rf.score(X_test,y_test)
predProb = rf.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.6035121963196208 - raw
#0.5711820261317555 - StandardScalar
#0.6044318818874117 - Standard Scalar without park/water info

0.7445730624028424

In [18]:
gb.fit(X_train, y_train)
gb.score(X_test,y_test)
predProb = gb.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.6867954021859667 - Raw
#0.6733643100619126 - Standard Scalar
#0.7150969608432872 - Standard Scalar without parks/water

0.723614812347324

In [19]:
bg.fit(X_train, y_train)
bg.score(X_test,y_test)
predProb = bg.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.5778244270123619 - Raw
#0.6225954160988963 - Standard Scalar
#0.6635925230279445 - Standard Scalar without parks/water

0.7294858982900289

In [20]:
ad.fit(X_train, y_train)
ad.score(X_test,y_test)
predProb = ad.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.5726531138364085 - Raw
#0.5756229105030342 - Standard Scalar
#0.626083878597413 -  Standard Scalar without parks/water

0.8067677104152786

In [21]:
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
preProb = kn.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.5726531138364085 - Raw
#0.5756229105030342 - Standard Scalar
#0.626083878597413 -  Standard Scalar without parks/water

0.8067677104152786