# Meta Classifier Tuning

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, StratifiedKFold

cv_labels = np.load('20160422-cv_labels.npy')
cv_preds_stack = pd.read_pickle('20160422-cv_preds_stack.pkl')

In [2]:
params = {
    'n_estimators': randint(10, 200),
    'max_features': uniform(loc=0.25, scale=.74),
    'random_state': [1],
    'n_jobs': [-1]
}
classifier = BaggingClassifier()
search_results = RandomizedSearchCV(estimator = classifier,
                                        param_distributions = params,
                                        n_iter = 100, n_jobs = 1,
                                        cv = 5, verbose = 3 )
# GridLogiClassifier = GridSearchCV(estimator = classifier, param_grid=params, n_jobs=1, cv=4, verbose=2)
search_results.fit(cv_preds_stack, cv_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1 
[CV]  max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1, score=0.949545 -   0.6s
[CV] max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1 
[CV]  max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1, score=0.950132 -   0.6s
[CV] max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1 
[CV]  max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1, score=0.946821 -   0.6s
[CV] max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1 
[CV]  max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1, score=0.947727 -   0.6s
[CV] max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1 
[CV]  max_features=0.439705667307, n_estimators=13, n_jobs=-1, random_state=1, score=0.948358 -   0.6s
[CV] max_features=0.64683975796,

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:   42.3s


[CV]  max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1, score=0.950132 -   0.8s
[CV] max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1 
[CV]  max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1, score=0.947215 -   0.8s
[CV] max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1 
[CV]  max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1, score=0.947806 -   0.8s
[CV] max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1 
[CV]  max_features=0.39341960054, n_estimators=38, n_jobs=-1, random_state=1, score=0.948437 -   0.8s
[CV] max_features=0.649543073036, n_estimators=77, n_jobs=-1, random_state=1 
[CV]  max_features=0.649543073036, n_estimators=77, n_jobs=-1, random_state=1, score=0.950648 -   1.7s
[CV] max_features=0.649543073036, n_estimators=77, n_jobs=-1, random_state=1 
[CV]  max_features=0.649543073036, n_estimators=77, n_jobs=-1, random_state=1, score=0.950250 -   1.6s

[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed:  4.3min


[CV]  max_features=0.255524172986, n_estimators=183, n_jobs=-1, random_state=1, score=0.946308 -   1.7s
[CV] max_features=0.255524172986, n_estimators=183, n_jobs=-1, random_state=1 
[CV]  max_features=0.255524172986, n_estimators=183, n_jobs=-1, random_state=1, score=0.946663 -   1.8s
[CV] max_features=0.255524172986, n_estimators=183, n_jobs=-1, random_state=1 
[CV]  max_features=0.255524172986, n_estimators=183, n_jobs=-1, random_state=1, score=0.947885 -   1.7s
[CV] max_features=0.676391373364, n_estimators=146, n_jobs=-1, random_state=1 
[CV]  max_features=0.676391373364, n_estimators=146, n_jobs=-1, random_state=1, score=0.950609 -   2.8s
[CV] max_features=0.676391373364, n_estimators=146, n_jobs=-1, random_state=1 
[CV]  max_features=0.676391373364, n_estimators=146, n_jobs=-1, random_state=1, score=0.950171 -   2.7s
[CV] max_features=0.676391373364, n_estimators=146, n_jobs=-1, random_state=1 
[CV]  max_features=0.676391373364, n_estimators=146, n_jobs=-1, random_state=1, score

[Parallel(n_jobs=1)]: Done 287 tasks       | elapsed: 10.6min


[CV]  max_features=0.822060803059, n_estimators=131, n_jobs=-1, random_state=1, score=0.946702 -   3.4s
[CV] max_features=0.822060803059, n_estimators=131, n_jobs=-1, random_state=1 
[CV]  max_features=0.822060803059, n_estimators=131, n_jobs=-1, random_state=1, score=0.947294 -   3.2s
[CV] max_features=0.822060803059, n_estimators=131, n_jobs=-1, random_state=1 
[CV]  max_features=0.822060803059, n_estimators=131, n_jobs=-1, random_state=1, score=0.948713 -   3.2s
[CV] max_features=0.73531668646, n_estimators=177, n_jobs=-1, random_state=1 
[CV]  max_features=0.73531668646, n_estimators=177, n_jobs=-1, random_state=1, score=0.950057 -   3.6s
[CV] max_features=0.73531668646, n_estimators=177, n_jobs=-1, random_state=1 
[CV]  max_features=0.73531668646, n_estimators=177, n_jobs=-1, random_state=1, score=0.950408 -   3.6s
[CV] max_features=0.73531668646, n_estimators=177, n_jobs=-1, random_state=1 
[CV]  max_features=0.73531668646, n_estimators=177, n_jobs=-1, random_state=1, score=0.947

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 18.5min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107f2a3d0>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107f2a5d0>, 'random_state': [1], 'n_jobs': [-1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=3)

In [3]:
search_results.best_score_

0.94910002601764465

In [4]:
search_results.best_params_

{'max_features': 0.6297698699152728,
 'n_estimators': 60,
 'n_jobs': -1,
 'random_state': 1}

## `RandomForestClassifier()`

In [4]:
search_results.best_score_

0.9489186909182652

In [5]:
search_results.best_params_

{'max_depth': None,
 'max_features': 0.38988227030541617,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 112,
 'n_jobs': -1,
 'random_state': 1}

## `BaggingClassifier()`

In [23]:
search_results.best_score_

0.94943115967738123

In [24]:
search_results.best_params_

{'max_features': 0.7268891521595635,
 'n_estimators': 26,
 'n_jobs': -1,
 'random_state': 1}

## 10-fold Cross Validation

In [7]:
def pred_and_error(model, test_data, test_labels):
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return preds, error

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    model = BaggingClassifier(max_features=0.6297698699152728,
                              n_estimators=60,
                              random_state=1, n_jobs=-1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    model = RandomForestClassifier(n_jobs=-1, min_samples_leaf=4, n_estimators=112,
                                   min_samples_split=2, random_state=1, max_features=0.38988227030541617,
                                   max_depth=None).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_neural(train_data, train_labels, test_data, test_labels):
    model = MLPClassifier(hidden_layer_sizes=1000).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

# kf = KFold(n_folds=10, shuffle=True, random_state=1)
kf = StratifiedKFold(n_folds=10, shuffle=True, random_state=1)
cv_errors = []
for i, (train, test) in enumerate(kf.split(cv_preds_stack, cv_labels)):
    cv_train_data = cv_preds_stack.iloc[train,:]
    cv_train_labels = cv_labels[train]
    cv_test_data = cv_preds_stack.iloc[test,:]
    cv_test_labels = cv_labels[test]
    
    print("Starting fold #{}".format(i+1))
    _, error = cv_run_bag(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print("Error: {}".format(error))
    
    cv_errors.append(error)
    
print(np.mean(cv_errors))

Starting fold #1
Error: 0.0490342924714
Starting fold #2
Error: 0.0493535162409
Starting fold #3
Error: 0.0530589719331
Starting fold #4
Error: 0.0516398612425
Starting fold #5
Error: 0.0520340586566
Starting fold #6
Error: 0.0536896877956
Starting fold #7
Error: 0.0510920129307
Starting fold #8
Error: 0.0509343215328
Starting fold #9
Error: 0.0491997161555
Starting fold #10
Error: 0.0494362532524
0.0509472692212
