### Model Building Starter

Below are all the relevant dataframes

Only thing that remains to be done is aggregate for when there are multiple observations for a single trap/day/species  


`df_w` include all the aggregated weather  
`df_a` includes weather and park/water data (note park/water data are svd features  
`df_s` includes all the spray data (so only years 2011/13) with categorically encoded "spray before" and "spray after" Variables  


No train/test splits have been done, but what we discussed was as standard train/test split for the spray data, and 4-fold CV for the other data, split by year (there is a 'yr' column in the dfs)

In [1]:
import pandas as pd
import pickle

with open('allDF.pickle','rb') as f:
    allDF = pickle.load(f)
    
df_w = allDF[0]
df_a = allDF[1]
df_s = allDF[2]

In [2]:
df = df_a

In [13]:
df.columns

Index(['NumMosquitos', 'WnvPresent', 'Date', 'Species', 'Trap', 'Location',
       'Yr', 'Week', 'Date_end', 'DewPointamax', 'DewPointamin',
       'DewPointmean', 'StnPressureamax', 'StnPressureamin', 'StnPressuremean',
       'AvgSpeedamax', 'AvgSpeedamin', 'AvgSpeedmean', 'temp_max', 'temp_min',
       'temp_avg', 'precip_total', 'precip_avg', 'Park0', 'Park1', 'Park2',
       'Park3', 'Park4', 'Park5', 'Water0', 'Water1', 'Water2', 'Water3',
       'Water4', 'Water5'],
      dtype='object')

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [4]:
test = df[df['Yr']==2009]
train = df[df['Yr']!= 2009]

y_train = train['WnvPresent']
y_test = test['WnvPresent']

toDrop = ['Date','WnvPresent','Location','NumMosquitos', 'Species','Trap','Location','Yr','Date_end',
          'DewPointamax', 'DewPointamin','temp_max', 'temp_min',
          'Park0', 'Park1', 'Park2','Park3', 'Park4', 'Park5', 
          'Water0', 'Water1', 'Water2', 'Water3','Water4', 'Water5'
         ]
X_train0 = train.drop(toDrop, axis = 'columns')
#X_train = X_train.iloc[:,4:] # dropping species, trap, date
X_test0 = test.drop(toDrop, axis = 'columns')
#X_test = X_test.iloc[:,4:] # dropping species, trap, date

In [5]:
X_train = ss.fit_transform(X_train0)
X_test = ss.transform(X_test0)

In [46]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def best_config(model, parameters, X_train, y_train, cv=None, n_jobs = 2):
    """
    A simple GridSearch 
    Requires: model, parameters, X & y variables
    +
    """
    clf = GridSearchCV(model, parameters, cv=cv, verbose=1, n_jobs=n_jobs)
    clf.fit(X_train, y_train)
    print("Finished "+str(model)[0:15])
    print('')
    best_estimator = clf.best_estimator_ 
    return [str(clf.best_params_), clf.best_score_,
            best_estimator]


def best_model(model_list, X_train, y_train):
    """
    Runs all the gridsearchs and produces a list of ALL results, returns only best_classifier
    """
    best_score = 0.0
    best_classifier = None
    classifiers = []
    for name, model, parameters in model_list:
        classifiers.append(best_config(model, parameters,
                                       X_train,
                                       y_train))
 
    for name, score, classifier in classifiers:
        if (score > best_score):
            best_score = score
            best_classifier = [name, classifier]
    return best_classifier[1]
 
    
    
def model_list():
    """
    Generates a list of models & params,
    Add more manually, for more options
    gb = GradientBoostingClassifier()
    rf = RandomForestClassifier()
    bg = BaggingClassifier()
    ad = AdaBoostClassifier()
    knn = KNeighborsClassifier()
    """
    models = []    
    
    #gradient
    gb_tuned_parameters = [{"n_estimators": [100 ,250]}]
    models.append(['GB', GradientBoostingClassifier(), gb_tuned_parameters])
 
    #random forest
    rf_tuned_parameters = [{"max_depth": [None]}]
    models.append(["RandomForest",RandomForestClassifier(n_jobs=-1),rf_tuned_parameters])

    #Bagged
    bg_tuned_parameters = [{"n_estimators": [10, 25]}]
    models.append(['GB', BaggingClassifier(), bg_tuned_parameters])

    #Ada Model
    ad_tuned_parameters = [{"n_estimators": [50, 100]}]
    models.append(['GB', AdaBoostClassifier(), ad_tuned_parameters])
    
    
    #kNN Model
    knn_tuned_parameters = [{"n_neighbors": [1, 3, 5, 10, 20], 
                             'weights':['uniform','distance'],
                            'metric':['euclidean','manhattan']}]
    models.append(["kNN", KNeighborsClassifier(),knn_tuned_parameters])
    return models

In [47]:
GS_model = best_model(model_list(),X_train,y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    5.5s finished


Finished GradientBoostin

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.8s finished


Finished RandomForestCla

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    3.4s finished


Finished BaggingClassifi

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    3.1s finished


Finished AdaBoostClassif

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   25.0s


Finished KNeighborsClass



[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   40.4s finished


In [48]:
GS_model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
gb = GradientBoostingClassifier()
rf = RandomForestClassifier()
bg = BaggingClassifier()
ad = AdaBoostClassifier()

rf.fit(X_train, y_train)
rf.score(X_test,y_test)
predProb = rf.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.6035121963196208 - raw
#0.5711820261317555 - StandardScalar
#0.6044318818874117 - Standard Scalar without park/water info

0.7445730624028424

In [18]:
gb.fit(X_train, y_train)
gb.score(X_test,y_test)
predProb = gb.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.6867954021859667 - Raw
#0.6733643100619126 - Standard Scalar
#0.7150969608432872 - Standard Scalar without parks/water

0.723614812347324

In [19]:
bg.fit(X_train, y_train)
bg.score(X_test,y_test)
predProb = bg.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.5778244270123619 - Raw
#0.6225954160988963 - Standard Scalar
#0.6635925230279445 - Standard Scalar without parks/water

0.7294858982900289

In [20]:
ad.fit(X_train, y_train)
ad.score(X_test,y_test)
predProb = ad.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.5726531138364085 - Raw
#0.5756229105030342 - Standard Scalar
#0.626083878597413 -  Standard Scalar without parks/water

0.8067677104152786

In [21]:
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
preProb = kn.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])
#0.5726531138364085 - Raw
#0.5756229105030342 - Standard Scalar
#0.626083878597413 -  Standard Scalar without parks/water

0.8067677104152786