# Meta Classifier Tuning

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

cv_labels = np.load('20160421-cv_labels.npy')
cv_preds_stack = pd.read_pickle('20160421-cv_preds_stack.pkl')

In [3]:
params = {
    'n_estimators': randint(10, 400),
    'max_features': uniform(loc=0.25, scale=.74),
    'max_depth': [None],
    'min_samples_split': randint(2, 5),
    'min_samples_leaf': randint(1, 5),
    'random_state': [1],
    'n_jobs': [-1]
}
classifier = RandomForestClassifier()
search_results = RandomizedSearchCV(estimator = classifier,
                                        param_distributions = params,
                                        n_iter = 100, n_jobs = 1,
                                        cv = 5, verbose = 3 )
# GridLogiClassifier = GridSearchCV(estimator = classifier, param_grid=params, n_jobs=1, cv=4, verbose=2)
search_results.fit(cv_preds_stack, cv_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=81, min_samples_split=3, random_state=1, max_features=0.397746422396, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=1, n_estimators=81, min_samples_split=3, random_state=1, max_features=0.397746422396, max_depth=None, score=0.950491 -   1.5s
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=81, min_samples_split=3, random_state=1, max_features=0.397746422396, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=1, n_estimators=81, min_samples_split=3, random_state=1, max_features=0.397746422396, max_depth=None, score=0.948752 -   1.4s
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=81, min_samples_split=3, random_state=1, max_features=0.397746422396, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=1, n_estimators=81, min_samples_split=3, random_state=1, max_features=0.397746422396, max_depth=None, score=0.946821 -   1.4s
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=81,

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed:  8.0min



[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=106, min_samples_split=3, random_state=1, max_features=0.448819477569, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=4, n_estimators=106, min_samples_split=3, random_state=1, max_features=0.448819477569, max_depth=None, score=0.947649 -   1.9s
[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=106, min_samples_split=3, random_state=1, max_features=0.448819477569, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=4, n_estimators=106, min_samples_split=3, random_state=1, max_features=0.448819477569, max_depth=None, score=0.948003 -   1.9s
[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=106, min_samples_split=3, random_state=1, max_features=0.448819477569, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=4, n_estimators=106, min_samples_split=3, random_state=1, max_features=0.448819477569, max_depth=None, score=0.948792 -   1.9s
[CV] n_jobs=-1, min_samples_leaf=3, n_estimators=161, min_samples_split=2, random_state=1, max_features=0.39

[Parallel(n_jobs=1)]: Done 287 tasks       | elapsed: 17.6min
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 31.5min finished





RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'n_jobs': [-1], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x112d46210>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x112d3ead0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x112d3eed0>, 'random_state': [1], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x112d3ecd0>, 'max_depth': [None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scor

## `RandomForestClassifier()`

In [4]:
search_results.best_score_

0.9489186909182652

In [5]:
search_results.best_params_

{'max_depth': None,
 'max_features': 0.38988227030541617,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 112,
 'n_jobs': -1,
 'random_state': 1}

## `BaggingClassifier()`

In [23]:
search_results.best_score_

0.94943115967738123

In [24]:
search_results.best_params_

{'max_features': 0.7268891521595635,
 'n_estimators': 26,
 'n_jobs': -1,
 'random_state': 1}

## 10-fold Cross Validation

In [6]:
def pred_and_error(model, test_data, test_labels):
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return preds, error

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    model = BaggingClassifier(max_features=0.7268891521595635,
                              n_estimators=26,
                              random_state=1, n_jobs=-1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    model = RandomForestClassifier(n_jobs=-1, min_samples_leaf=4, n_estimators=112,
                                   min_samples_split=2, random_state=1, max_features=0.38988227030541617,
                                   max_depth=None).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

kf = KFold(n_folds=10, shuffle=True, random_state=1)

cv_errors = []
for i, (train, test) in enumerate(kf.split(cv_preds_stack)):
    cv_train_data = cv_preds_stack.iloc[train,:]
    cv_train_labels = cv_labels[train]
    cv_test_data = cv_preds_stack.iloc[test,:]
    cv_test_labels = cv_labels[test]
    
    print("Starting fold #{}".format(i+1))
    preds_3, error_3 = cv_run_rf(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print("Error: {}".format(error_3))
    
    cv_errors.append(error_3)
    
print(np.mean(cv_errors))

Starting fold #1
Error: 0.0508514664144
Starting fold #2
Error: 0.0507726269316
Starting fold #3
Error: 0.0507726269316
Starting fold #4
Error: 0.054793440555
Starting fold #5
Error: 0.0491958372753
Starting fold #6
Error: 0.0504572690003
Starting fold #7
Error: 0.0532166508988
Starting fold #8
Error: 0.0505400930379
Starting fold #9
Error: 0.0483324134668
Starting fold #10
Error: 0.0488843333596
0.0507816757871
