### Hyperparameter tuning

In [None]:
def grid_search_cv(clf, X, y, param_grid, scoring="roc_auc", cv=10, n_jobs=-1):
    """Run a cross-validated grid search and print a score report.

    Builds a ``GridSearchCV`` around ``clf``, fits it on ``(X, y)``, then
    prints the best score/parameters followed by one line per candidate with
    its mean and standard deviation of the test score.

    Parameters
    ----------
    clf : estimator
        Passed to ``GridSearchCV`` as ``estimator``.
    X, y
        Forwarded unchanged to ``GridSearchCV.fit``.
    param_grid
        Forwarded unchanged to ``GridSearchCV`` as ``param_grid``.
    scoring : default "roc_auc"
        Forwarded to ``GridSearchCV`` as ``scoring``.
    cv : default 10
        Forwarded to ``GridSearchCV`` as ``cv``. The default keeps the
        previously hard-coded value.
    n_jobs : default -1
        Forwarded to ``GridSearchCV`` as ``n_jobs``. The default keeps the
        previously hard-coded value.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        n_jobs=n_jobs,
        cv=cv,
        scoring=scoring,
    )
    search_result = search.fit(X, y)

    print("Best: %f using %s" % (search_result.best_score_, search_result.best_params_))

    # cv_results_ holds parallel sequences: entry i of each describes candidate i.
    results = search_result.cv_results_
    for mean, stdev, param in zip(
        results['mean_test_score'], results['std_test_score'], results['params']
    ):
        print("%f (%f) with: %r" % (mean, stdev, param))

**Select the number of trees**

In [None]:
# Coarse sweep over the number of boosting stages; all other
# GradientBoostingClassifier settings are left at their defaults.
# NOTE(review): define_preprocessor presumably builds the feature
# preprocessing from the training column names -- confirm in utils.
preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier()
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)

# The "classifier__" prefix targets the pipeline step named "classifier".
param_grid = {
    "classifier__n_estimators": [50, 100, 250, 500, 750, 1000, 3000],
}

grid_search_cv(pipe, X_train, y_train, param_grid)

In [None]:
# Finer sweep over the number of boosting stages (150..450 in steps of 50),
# again with every other classifier setting at its default.
preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier()
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)

param_grid = {
    "classifier__n_estimators": [150, 200, 250, 300, 350, 400, 450],
}

grid_search_cv(pipe, X_train, y_train, param_grid)

**Tree-specific parameters**

In [None]:
# Sweep the tree depth over the odd values 3, 5, ..., 15 with the number of
# boosting stages fixed at 200.
preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier(n_estimators=200)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)

param_grid = {"classifier__max_depth": range(3, 16, 2)}

grid_search_cv(pipe, X_train, y_train, param_grid)

In [None]:
# Joint sweep of the split and leaf size thresholds, each over 5, 10, ..., 100,
# with 200 boosting stages and depth-3 trees.
preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier(n_estimators=200, max_depth=3)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)

# Both axes have 20 values, so the grid holds 20 * 20 = 400 candidates.
param_grid = {
    "classifier__min_samples_split": range(5, 101, 5),
    "classifier__min_samples_leaf": range(5, 101, 5),
}

grid_search_cv(pipe, X_train, y_train, param_grid)

**Subsampling**

In [None]:
# Compare three subsample fractions with 200 boosting stages and a
# learning rate of 0.09.
preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.09)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)

param_grid = {
    "classifier__subsample": [0.5, 0.75, 1.0],
}

grid_search_cv(pipe, X_train, y_train, param_grid)

**Max depth**

In [None]:
# Sweep the tree depth over 1..10 with 200 boosting stages, learning rate
# 0.09 and a subsample fraction of 0.75.
#
# max_depth must be an integer: scikit-learn documents it as int (or None),
# and recent releases reject float values such as those produced by
# np.linspace(1, 10, 10). range(1, 11) gives the same ten values as ints.
param_grid = {"classifier__max_depth": range(1, 11)}

preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.09, subsample=0.75)
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', clf)])

grid_search_cv(pipe, X_train, y_train, param_grid)

**Fine tune the learning rate**

In [None]:
# Sweep the learning rate over ten evenly spaced values from 2/200 to 20/200
# (0.01 to 0.10) with the number of boosting stages fixed at 200.
# NOTE(review): unlike the preceding max-depth cell, this classifier does not
# set subsample -- confirm that falling back to the default is intended.
preprocessor = utils.define_preprocessor(X_train.columns)
clf = GradientBoostingClassifier(n_estimators=200)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)

param_grid = {
    "classifier__learning_rate": np.linspace(2 / 200, 20 / 200, 10),
}

grid_search_cv(pipe, X_train, y_train, param_grid)

In [None]:
# class DictDist():
#     def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
#     def rvs(self, n):
#         a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
#         out = []
#         for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
#         return out
    
# class Choice():
#     def __init__(self, options): self.options = options
#     def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [None]:
# N = 15
# SEED = 1443
# RF_dist = DictDist({
#     'n_estimators': ss.randint(50, 500),
#     'max_depth': ss.randint(2, 10),
#     'min_samples_split': ss.randint(2, 75),
#     'min_samples_leaf': ss.randint(1, 50),
# })
# np.random.seed(SEED)
# RF_hyperparams_list = RF_dist.rvs(N)