In [1]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint as sp_randint
import datasets
from time import time

In [2]:
# Load the train/test splits exposed by the project's `datasets.load_rbm()` helper.
data = datasets.load_rbm()

# Learn the standardisation statistics from the training features only, then
# apply that same fitted transform to both the training and the test features.
scaler = StandardScaler()
scaler.fit(data.X_train)
X_train = scaler.transform(data.X_train)
X_test = scaler.transform(data.X_test)

# Labels are used as provided by the loader.
Y_train, Y_test = data.y_train, data.y_test

# Number of columns in the scaled training matrix.
n_feats = X_train.shape[1]

In [11]:
# Search grid handed to the cross-validated search. Every entry is a
# single-element list, so the grid contains exactly one candidate.
# Keys prefixed with `base_estimator__` are routed to the estimator wrapped by
# the booster; the unprefixed keys configure the booster itself.
parameters = dict(
    base_estimator__max_depth=[None],
    base_estimator__max_features=['log2'],
    n_estimators=[100],
    learning_rate=[0.001],
)

In [12]:
# Base learner: a 15-tree bootstrap random forest with a fixed seed that uses
# every available core (n_jobs=-1).
# NOTE(review): warm_start=True presumably has no effect once the forest is
# wrapped in AdaBoost (the booster is expected to fit fresh copies) - confirm.
rndf = RandomForestClassifier(
    n_estimators=15,
    bootstrap=True,
    warm_start=True,
    n_jobs=-1,
    random_state=1,
)

# Boosted ensemble built on top of the forest, seeded for reproducibility.
adaboost = AdaBoostClassifier(
    base_estimator=rndf,
    random_state=1,
)

In [13]:
# Despite the variable name, this is an exhaustive GridSearchCV over
# `parameters`, not a randomized search. n_jobs=-1 asks for all cores.
# NOTE(review): the wrapped forest also requests n_jobs=-1; check whether the
# nested parallelism oversubscribes the machine.
random_search = GridSearchCV(
    estimator=adaboost,
    param_grid=parameters,
    n_jobs=-1,
)

In [14]:
# Capture the wall-clock start immediately before fitting; the elapsed time is
# computed from `start` in a later cell.
start = time()
# Run the cross-validated search on the scaled training data. The call is left
# as the cell's final expression so the fitted search object's repr is shown.
random_search.fit(X_train, Y_train)


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=True),
          learning_rate=1.0, n_estimators=50, random_state=1),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'n_estimators': [100], 'learning_rate': [0.001], 'base_estimator__max_depth': [None], 'base_estimator__max_features': ['log2']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [15]:
# Report how long the search took. The search object is a GridSearchCV, so the
# message names it accordingly.
# `start` was captured in the cell that launched the fit, so this figure also
# includes any idle time between running that cell and this one.
print("GridSearchCV took %.2f seconds." % (time() - start))

RandomizedSearchCV took 240.04 seconds.


In [16]:
from operator import itemgetter
def report(grid_scores, n_top=5):
    """Print a ranked summary of the best-scoring parameter settings.

    Args:
        grid_scores: iterable of score records. Each record is indexable
            (element 1 is used as the sort key) and exposes the attributes
            ``mean_validation_score``, ``cv_validation_scores`` and
            ``parameters``.
        n_top: maximum number of records to print, best first.

    Returns:
        None. The summary is written to standard output.
    """
    by_mean_score = itemgetter(1)
    ranked = sorted(grid_scores, key=by_mean_score, reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        print("Model with rank: {0}".format(rank))
        mean_score = entry.mean_validation_score
        # Spread of the per-fold scores around their mean.
        fold_std = np.std(entry.cv_validation_scores)
        summary = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
            mean_score, fold_std)
        print(summary)
        print("Parameters: {0}".format(entry.parameters))
        # Blank line between consecutive entries.
        print("")

# `report` prints its summary itself and returns None, so call it directly;
# wrapping the call in `print` would emit a stray "None" after the summary.
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.365 (std: 0.002)
Parameters: {'n_estimators': 100, 'learning_rate': 0.001, 'base_estimator__max_depth': None, 'base_estimator__max_features': 'log2'}

None


In [17]:
# Score the fitted search object on the scaled held-out test split.
# Single-argument parenthesised prints match the print style used in `report`.
print('Test set accuracy:')
print(random_search.score(X_test, Y_test))

Test set accuracy:
0.3712


In [18]:
import cPickle
# Save the fitted search object (including the estimator it wraps) to disk.
# The highest pickle protocol is passed explicitly: the default, protocol 0, is
# an ASCII format that is considerably larger and slower to write. The file is
# already opened in binary mode, and loading detects the protocol by itself,
# so existing `cPickle.load` readers keep working.
with open('rbm_adaboost.pkl', 'wb') as fid:
    cPickle.dump(random_search, fid, cPickle.HIGHEST_PROTOCOL)