In [None]:
# <api>
import math
import numpy as np
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
import work.marvin.binary_classifier_models.modelfit as modelfit

In [None]:
# <api>
def bestModelProducer(data, target, datamapper, fig_path=None, seed=27, verbose=0):
    """
    Auto GBDT model generation, 3 steps:
    1. estimate the parameter grid for the grid search from the sample
       size and the feature size
    2. run the grid search to find the best parameter set
    3. build a GBDT model configured with the best parameter set

    Parameters
    ----------
    data : input handed to ``modelfit.prepareDataforTraining``, which is
        expected to return a ``(traindf, testdf)`` pair.
    target : name of the label column in the training frame.
    datamapper : object exposing ``fit_transform``; applied to every
        column of the training frame except ``target``.
    fig_path : forwarded unchanged to ``produceBestGBMmodel``.
    seed : random seed forwarded to ``produceBestGBMmodel``.
    verbose : forwarded unchanged to ``produceBestGBMmodel``.

    Returns
    -------
    tuple ``(bestModel, traindf, testdf)``.
    """
    traindf, testdf = modelfit.prepareDataforTraining(data)

    # Keep honouring a mapper that really exposes ``shape``; otherwise size
    # the grid from the transformed training matrix.
    shape = getattr(datamapper, 'shape', None)
    if shape is None:
        train = datamapper.fit_transform(traindf[traindf.columns.difference([target])])
        # parameterGridInitialization computes feature_size as shape[1] - 1,
        # i.e. it expects the column count to include the target column.
        shape = (train.shape[0], train.shape[1] + 1)

    # estimate optimal parameters grid space
    configspace = parameterGridInitialization(shape)
    bestModel = produceBestGBMmodel(traindf, datamapper, target, configspace,
                                    fig_path, seed, verbose)
    return bestModel, traindf, testdf

In [8]:
# <api>
def produceBestGBMmodel(traindf, datamapper, target, configspace,
                        fig_path=None, seed=27, verbose=0):
    """
    Grid-search GBDT hyper-parameters and build a classifier from the winners.

    Parameters
    ----------
    traindf : training frame containing the feature columns and ``target``.
    datamapper : object exposing ``fit_transform``; applied to every
        column of ``traindf`` except ``target``.
    target : name of the label column.
    configspace : pair ``(param_grid1, param_grid2)`` as produced by
        ``parameterGridInitialization``.
    fig_path : accepted for interface compatibility; not used here.
    seed : random seed used for the grid search and for the returned model.
    verbose : accepted for interface compatibility; not used here.

    Returns
    -------
    A ``GradientBoostingClassifier`` configured with the best parameters.
    The returned estimator has NOT been fitted by this function.
    """
    param_grid1, param_grid2 = configspace

    # datamapper transform
    train = datamapper.fit_transform(traindf[traindf.columns.difference([target])])
    labels_train = traindf[target]

    # running grid search to get the best parameter set; the caller's seed is
    # forwarded so the search and the final model use the same seed
    (best_subsample, best_estimators, best_learning_rate, best_max_depth,
     best_max_feature, best_min_samples_split) = gbmGridSearch(train,
                                                               labels_train,
                                                               param_grid1,
                                                               param_grid2,
                                                               seed=seed)

    # configure a gbm with the best parameter set
    gbm_best = GradientBoostingClassifier(learning_rate=best_learning_rate,
                                          n_estimators=best_estimators,
                                          max_depth=best_max_depth,
                                          min_samples_split=best_min_samples_split,
                                          subsample=best_subsample,
                                          max_features=best_max_feature,
                                          random_state=seed)
    return gbm_best

In [None]:
# <api>
def n_estimators_space(train_size):
    """
    Candidate values for ``n_estimators``.

    More than 10000 training rows gives 200..1000 in steps of 200;
    anything else gives 50..200 in steps of 50.
    """
    is_large = train_size > 10000
    start, stop, step = (200, 1001, 200) if is_large else (50, 201, 50)
    return list(range(start, stop, step))

# <api>
def min_samples_split_space(train_size):
    """
    Candidate values for ``min_samples_split``.

    Yields 100, 200, ... in steps of 100, stopping before
    ``min(train_size, 601)``. When that range is empty (``train_size`` of
    100 or less) a single candidate is returned instead, so the grid search
    never receives an empty list. The fallback is the lower bound, clamped
    to 2, the smallest integer value the sklearn tree estimators accept.
    """
    lower = min(train_size, 100)
    upper = min(train_size, 601)
    candidates = list(range(lower, upper, 100))
    if not candidates:
        candidates = [max(2, lower)]
    return candidates

# <api>
def max_feature_space(feature_size):
    """
    Candidate values for ``max_features``, centred on sqrt(feature_size).

    The range runs in steps of 2 up to (but excluding) 1.5 * sqrt. It starts
    at sqrt - 3 when sqrt is above 10, otherwise at sqrt. If that range is
    empty (very small ``feature_size``) the lower bound alone is returned,
    so the grid search never receives an empty list. The lower bound is
    clamped to 1.
    """
    fs_sqrt = math.sqrt(feature_size)
    # widen the search downwards only for large feature counts
    offset = 3 if fs_sqrt > 10 else 0
    lower = max(1, int(fs_sqrt - offset))
    upper = int(fs_sqrt * 1.50)
    candidates = list(range(lower, upper, 2))
    return candidates or [lower]

# <api>
def max_depth_space(feature_size):
    """
    Candidate values for ``max_depth``: the odd depths 3 through 9.

    ``feature_size`` is accepted for symmetry with the other ``*_space``
    helpers but does not influence the result.
    """
    return list(range(3, 10, 2))

In [9]:
# <api>
def parameterGridInitialization(shape):
    """
    Build the two parameter grids consumed by ``gbmGridSearch``.

    Parameters
    ----------
    shape : ``(n_rows, n_columns)``. The feature count is taken as
        ``n_columns - 1``, so the column count is expected to include one
        non-feature column (presumably the target - confirm with callers).

    Returns
    -------
    ``(param_grid1, param_grid2)``: the first grid holds the boosting
    parameters (subsample, n_estimators, learning_rate), the second the
    tree-specific ones (max_depth, max_features, min_samples_split).
    """
    feature_size = shape[1] - 1
    train_size = shape[0]

    subsample_spc = [0.6, 0.7, 0.8, 0.9]
    learning_rate_spc = [0.01, 0.05, 0.1]
    n_estimators_spc = n_estimators_space(train_size)
    min_samples_split_spc = min_samples_split_space(train_size)
    max_feature_spc = max_feature_space(feature_size)
    max_depth_spc = max_depth_space(feature_size)

    # most important parameters
    param_grid1 = {'subsample': subsample_spc, 'n_estimators': n_estimators_spc,
                   'learning_rate': learning_rate_spc}
    # tree specific parameters
    param_grid2 = {'max_depth': max_depth_spc, 'max_features': max_feature_spc,
                   'min_samples_split': min_samples_split_spc}

    return param_grid1, param_grid2

In [None]:
# <api>
def configSpaceInitialization(shape):
    """
    Build the ``(low, high)`` search bounds for each GBDT hyper-parameter.

    ``shape`` is ``(n_rows, n_columns)``; the feature count is taken as
    ``n_columns - 1``. Only the ``min_samples_split`` bounds depend on the
    number of rows: ``(50, 500)`` from 1000 rows upwards, otherwise
    ``(20, n_rows)``.

    NOTE(review): with fewer than 20 rows the ``min_samples_split`` lower
    bound exceeds the upper bound, and with fewer than 2 features the same
    happens for ``max_features``.
    """
    n_rows = shape[0]
    n_features = shape[1] - 1

    split_bounds = (50, 500) if n_rows >= 1000 else (20, n_rows)

    return {
        'max_features': (2, n_features),
        'max_depth': (2, 9),
        'learning_rate': (0.01, 0.2),
        'min_samples_split': split_bounds,
        'n_estimators': (50, 800),
        'subsample': (0.2, 0.9),
    }

In [10]:
# <api>
def gbmGridSearch(train, labels_train, param_grid1, param_grid2, seed=27):
    """
    Two-stage grid search for GBDT hyper-parameters, scored by ROC AUC.

    Stage one searches ``param_grid1`` (subsample, n_estimators,
    learning_rate) with the tree parameters held fixed. Stage two fixes the
    stage-one winners and searches ``param_grid2`` (max_depth, max_features,
    min_samples_split).

    Returns the tuple ``(subsample, n_estimators, learning_rate, max_depth,
    max_features, min_samples_split)`` of best values.
    """
    def _search(base_estimator, grid):
        # Both stages share the same search settings; only the base
        # estimator and the grid differ.
        searcher = GridSearchCV(estimator=base_estimator, param_grid=grid,
                                scoring='roc_auc', n_jobs=-1,
                                pre_dispatch='2*n_jobs', iid=False, cv=5)
        searcher.fit(train, labels_train)
        return searcher.best_estimator_.get_params()

    # NOTE(review): stage one uses a hard-coded random_state of 10 rather
    # than ``seed``; confirm whether that is intentional.
    stage_one_base = GradientBoostingClassifier(min_samples_split=30,
                                                max_features='sqrt',
                                                max_depth=5,
                                                random_state=10)
    stage_one = _search(stage_one_base, param_grid1)
    best_subsample = stage_one["subsample"]
    best_estimators = stage_one['n_estimators']
    best_learning_rate = stage_one['learning_rate']

    stage_two_base = GradientBoostingClassifier(subsample=best_subsample,
                                                n_estimators=best_estimators,
                                                learning_rate=best_learning_rate,
                                                random_state=seed)
    stage_two = _search(stage_two_base, param_grid2)

    return (best_subsample, best_estimators, best_learning_rate,
            stage_two["max_depth"], stage_two["max_features"],
            stage_two["min_samples_split"])

In [None]:
# <api>
def produceBestModel(traindf, datamapper, target, configspace, fig_path=None, seed=27, verbose=0):
    """
    Convenience wrapper around ``produceBestGBMmodel`` with defaults for
    ``fig_path``, ``seed`` and ``verbose``.

    All arguments are forwarded unchanged, including the caller's
    ``verbose`` value. Returns whatever ``produceBestGBMmodel`` returns.
    """
    return produceBestGBMmodel(traindf, datamapper, target, configspace,
                               fig_path, seed, verbose=verbose)

In [None]:
# <api>
def optimizeBestModel(traindf, datamapper, target,
                      configspace, search_alg,
                      fig_path=None, n_calls=100,
                      verbose=0, seed=27):
    """
    Search GBDT hyper-parameters via ``modelfit.searchBestParamsSkopt`` and
    build a classifier from the best parameter set.

    ``search_alg`` is passed through to the search helper; the values noted
    in this file are skopt_gbrt_search, skopt_gp_search and
    skopt_forest_search. ``fig_path`` and ``verbose`` are accepted but not
    used here. ``seed`` is applied only to the returned classifier.

    Returns a ``GradientBoostingClassifier`` configured with the best
    parameters; this function does not fit it.
    """
    # datamapper transform on every column except the target
    feature_columns = traindf.columns.difference([target])
    train = datamapper.fit_transform(traindf[feature_columns])
    labels_train = np.array(traindf[target])

    # running the skopt search to get the best parameter set; the trace
    # returned alongside it is not used
    best_params, _trace = modelfit.searchBestParamsSkopt(train, labels_train,
                                                         configspace, search_alg,
                                                         GradientBoostingClassifier, n_calls)

    tuned_names = ('learning_rate', 'n_estimators', 'max_depth',
                   'min_samples_split', 'subsample', 'max_features')
    tuned = {name: best_params[name] for name in tuned_names}
    return GradientBoostingClassifier(random_state=seed, **tuned)