In [50]:
# Execute ProcessPipeline.py inside this notebook's namespace (-i), so the
# names it defines become available to later cells -- presumably including
# ProcessPipeline() and the `pd` alias used below; TODO confirm.
%run -i ProcessPipeline.py

In [77]:
%%time

# Load the raw training and test CSVs. Only the test file uses its first
# column as the index.
FullTrain = pd.read_csv('./input/train.csv')
Ftest = pd.read_csv('./input/test.csv', index_col= 0)

# Run both frames through the pipeline. NOTE: FullTrain is rebound to the
# processed training frame here, so the raw training frame is no longer
# reachable afterwards; the processed test frame is kept as Ftestpp.
# ProcessPipeline is not defined in this notebook -- presumably it comes from
# the `%run -i ProcessPipeline.py` cell; verify.
FullTrain , Ftestpp = ProcessPipeline(FullTrain,Ftest)

0 (116293, 10)
1 (116293, 13)
3 (116293, 21)
2 (116293, 38)
Wall time: 3min 46s


In [78]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [79]:
# Test-set feature matrix: remove four columns that are also removed from
# the training frame before modelling.
Ftest2 = Ftestpp.drop(['Date','Species','Trap','Location'], axis = 'columns')

In [6]:
# List the column labels of the processed training frame.
FullTrain.columns

Index(['NumMosquitos', 'WnvPresent', 'Date', 'Species', 'Trap', 'Latitude',
       'Longitude', 'Location', 'CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS',
       'CULEX RESTUANS', 'CULEX SALINARIUS', 'CULEX TERRITANS',
       'CULEX TARSALIS', 'CULEX ERRATICUS', 'DewPointamax', 'DewPointamin',
       'DewPointmean', 'StnPressureamax', 'StnPressureamin', 'StnPressuremean',
       'AvgSpeedamax', 'AvgSpeedamin', 'AvgSpeedmean', 'Tmaxamax', 'Tminamin',
       'Tavgmean', 'PrecipTotalsum', 'PrecipTotalmean', 'temp_expected',
       'temp_diff', 'sunset', 'W0', 'W1', 'W2', 'W3', 'P0', 'P1', 'P2', 'P3'],
      dtype='object')

In [80]:
# Label vector and feature matrix: the label itself and the columns listed
# in toDrop are removed before modelling.
target = FullTrain['WnvPresent']
toDrop = ['Date', 'WnvPresent','Location','NumMosquitos','Species',
          'Trap']

X = FullTrain.drop(toDrop, axis = 'columns')

# A fixed seed makes the hold-out split (and every score computed on it)
# reproducible; stratify keeps the WnvPresent class ratio the same in the
# training and hold-out parts.
SEED = 42
X_train0, X_test0, y_train, y_test = train_test_split(
    X, target, test_size = .2, stratify = target, random_state = SEED)

# Scaler for the hold-out experiment: fitted on the training split only and
# then applied to the hold-out rows.
ss_split = StandardScaler()
X_train = ss_split.fit_transform(X_train0)
X_test = ss_split.transform(X_test0)

# Separate scaler for models trained on all labelled rows. Using a second
# object means the split scaler is not silently re-fitted; `ss` keeps the
# state it had at the end of the original cell (fitted on all of X).
ss = StandardScaler()
Xss = ss.fit_transform(X)

# Build the scaled test matrix from Ftestpp instead of re-scaling Ftest2 in
# place, so re-running this cell does not standardize the data twice.
Ftest2 = ss.transform(
    Ftestpp.drop(['Date','Species','Trap','Location'], axis = 'columns'))

In [65]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV


def best_model(model_list, X_train, y_train):
    """
    Tune every candidate model and return the best fitted estimator.

    Each entry of ``model_list`` is handed to ``best_config``, which runs a
    randomized hyper-parameter search with stratified k-fold CV. The
    estimator belonging to the search with the highest score is returned;
    on a tie the entry that appears first in ``model_list`` wins.

    Parameters
    ----------
    model_list : iterable of (name, model, parameters)
        Candidates in the layout produced by ``model_list()``.
    X_train, y_train
        Training features and labels, forwarded unchanged to ``best_config``.

    Returns
    -------
    The estimator (third element of the ``best_config`` result) of the
    winning search.

    Raises
    ------
    ValueError
        If ``model_list`` contains no candidates.
    """
    results = [best_config(name, model, parameters, X_train, y_train)
               for name, model, parameters in model_list]
    if not results:
        raise ValueError("model_list is empty; there is nothing to search")

    # Each result is [params-string, score, estimator]. Picking the maximum
    # directly (instead of comparing against a 0.0 sentinel) still returns a
    # model when no score is strictly greater than zero, where the sentinel
    # version ended up indexing None.
    _, _, best_classifier = max(results, key=lambda result: result[1])
    return best_classifier

def best_config(name, model, parameters, X_train, y_train,
                scoring=None, n_iter=15, n_splits=5, random_state=None):
    """
    Tune one model with a randomized search over stratified k-fold CV.

    This is a ``RandomizedSearchCV`` (a sample of the parameter space), not
    an exhaustive grid search.

    Parameters
    ----------
    name
        Label used only in the progress messages.
    model
        Estimator to tune.
    parameters
        Sequence whose first element is the parameter space; only
        ``parameters[0]`` is searched.
    X_train, y_train
        Data the search is fitted on.
    scoring : optional
        Forwarded to ``RandomizedSearchCV``. ``None`` (the default) keeps
        the estimator's own default scorer, as before.
    n_iter : int, default 15
        Number of parameter settings sampled.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : optional
        Seed for the parameter sampling; ``None`` (the default) leaves the
        sampling unseeded, as before.

    Returns
    -------
    list
        ``[str(best_params_), best_score_, best_estimator_]``. Note that the
        first element is the winning parameter set as a string, not ``name``.
    """
    folds = StratifiedKFold(n_splits=n_splits)
    print("Searching "+str(name))
    search = RandomizedSearchCV(model, parameters[0], cv=folds,
                                n_iter=n_iter, verbose=1, n_jobs=2,
                                scoring=scoring, random_state=random_state)
    search.fit(X_train, y_train)
    print("Finished "+str(name))
    print('')
    return [str(search.best_params_),
            search.best_score_,
            search.best_estimator_]

def model_list():
    """
    Build the candidate models together with their search spaces.

    Returns a list holding one ``[label, estimator, [param_space]]`` entry
    per model, in this order: gradient boosting, random forest, bagging,
    AdaBoost, k-nearest neighbours. Append further entries of the same
    layout to try more models.
    """
    boosting_space = {
        "n_estimators": [50, 75, 100, 150, 200, 250],
        'learning_rate': [0.05, 0.1, 0.2],
        'min_samples_split': [2, 3, 4],
        'max_depth': [2, 3, 4, 5],
    }
    forest_space = {
        "max_depth": [None, 2, 3, 4, 5],
        'criterion': ['gini'],
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3],
    }
    # Bagging and AdaBoost search the same ensemble sizes (2..19) but each
    # gets its own dictionary object.
    bagging_space = {"n_estimators": list(range(2, 20))}
    ada_space = {"n_estimators": list(range(2, 20))}
    neighbours_space = {
        "n_neighbors": [1, 3, 5, 10, 20],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }

    # Every space is wrapped in a one-element list because the consumer
    # reads it as parameters[0].
    return [
        ['GB', GradientBoostingClassifier(), [boosting_space]],
        ["RandomForest", RandomForestClassifier(n_jobs=-1), [forest_space]],
        ['BG', BaggingClassifier(), [bagging_space]],
        ['AD', AdaBoostClassifier(), [ada_space]],
        ["kNN", KNeighborsClassifier(), [neighbours_space]],
    ]

In [66]:
# Tune every candidate from model_list() on the scaled training split and
# keep whatever estimator best_model returns.
GS_model = best_model(model_list(),X_train,y_train)

Searching GB
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   33.5s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   50.0s finished


Finished GB

Searching RandomForest
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   15.5s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   20.5s finished


Finished RandomForest

Searching BG
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   17.2s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   21.3s finished


Finished BG

Searching AD
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  58 tasks      | elapsed:   10.6s
[Parallel(n_jobs=2)]: Done  72 out of  75 | elapsed:   11.4s remaining:    0.4s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   11.5s finished


Finished AD

Searching kNN
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   59.9s


Finished kNN



[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:  1.6min finished


In [67]:
# Display the estimator returned by the model search.
GS_model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=3,
              min_weight_fraction_leaf=0.0, n_estimators=250,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [14]:
from sklearn.metrics import roc_auc_score
# Gradient boosting with fixed hyper-parameters, fitted on the scaled
# training split.
gb= GradientBoostingClassifier(n_estimators = 75,
                               min_samples_split= 4, max_depth = 3, 
                              learning_rate = .05)
gb.fit(X_train,y_train)
# Column 1 of predict_proba is taken as the positive-class probability;
# the cell's output is the ROC AUC on the hold-out split.
predProb = gb.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])

0.80082970244932272

In [81]:
# Random forest trained on every labelled row.
# It is fitted on Xss, the standardized copy of X: Ftest2 was transformed by
# the same scaler, so training and prediction must use that scale. Fitting
# on the raw X would make the forest learn split thresholds in original
# units and then apply them to standardized test values.
# random_state pins the bootstrap/feature sampling so the submission can be
# reproduced.
rf = RandomForestClassifier(n_jobs = -1, n_estimators = 200, random_state = 42)
rf.fit(Xss, target)

# In-sample probabilities (same rows the forest was trained on); kept
# because the following cell scores them. This is not a generalization
# estimate.
predProb = rf.predict_proba(Xss)

# Positive-class probabilities for the test rows, used for the submission.
testProb = rf.predict_proba(Ftest2)[:,1]

In [82]:
# NOTE(review): assuming predProb still holds the random-forest output from
# the preceding cell, this scores the forest on the very rows it was fitted
# on, so the result is a training-set AUC and says nothing about hold-out
# performance.
roc_auc_score(target, predProb[:,1])

1.0

In [17]:
# AdaBoost with default settings, evaluated on the hold-out split.
# NOTE: the variable is named `bg` although it holds an AdaBoostClassifier;
# the name is kept because other cells call bg.predict_proba.
bg = AdaBoostClassifier()
bg.fit(X_train,y_train)
# Predict on X_test, the rows that y_test labels. Predicting on Ftest2 (the
# unlabelled test matrix) yields a different number of rows than y_test, so
# the AUC call below could not be computed.
predProb = bg.predict_proba(X_test)
roc_auc_score(y_test, predProb[:,1])

0.82247704361552632

In [26]:
# Row/column count of the test frame.
Ftest.shape

(112650, 38)

In [25]:
# Row/column count of the test feature matrix.
Ftest2.shape

(112650, 32)

In [54]:
# Column 1 of the `bg` model's predict_proba output for the test matrix.
# NOTE(review): this rebinds predProb; the submission cell below reads
# testProb, not predProb, so this result is not what gets written out.
predProb = bg.predict_proba(Ftest2)[:,1]

In [83]:
# Assemble the submission table in one step: a 1-based Id column next to the
# predicted WnvPresent probability, with the frame index also running from 1.
dfs = pd.DataFrame(
    {'Id': range(1, len(testProb) + 1), 'WnvPresent': testProb},
    index = range(1, len(testProb) + 1),
)
# index = False: only the Id and WnvPresent columns are written to the file.
dfs.to_csv('preds3.csv', index = False)

In [None]:
# FIXME: unfinished, never-executed cell. predict_proba is called without
# the feature matrix it needs, so running this raises a TypeError. Supply
# the intended input or delete the cell.
predProb = bg.predict_proba()