In [3]:
# <api>
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.grid_search import GridSearchCV

from skopt.space import Categorical
import work.marvin.binary_classifier_models.modelfit as modelfit

import logging

logger = logging.getLogger(__name__)

In [None]:
# <api>
def bestModelProducer(data, target, datamapper, figpath=None):
    """
    Automatic LightGBM model generation in three steps:

    1. estimate the parameter grids for the grid search from the size of
       the transformed training matrix (rows and columns),
    2. run the staged grid search to find the best parameter set,
    3. build the GBDT classifier configured with that parameter set.

    Parameters
    ----------
    data : input handed to ``modelfit.prepareDataforTraining``.
    target : name of the label column in the training frame.
    datamapper : object exposing ``fit_transform``; it is fitted on the
        training columns other than ``target``.
    figpath : forwarded to ``produceBestLightgbmModel`` as ``fig_path``.

    Returns
    -------
    tuple
        ``(lightgbm_best, traindf, testdf)``.
    """

    traindf, testdf = modelfit.prepareDataforTraining(data, datamapper)

    # Keep the transformed matrix: the grid is sized from ITS shape.  The
    # mapper itself is not the feature matrix, so passing ``datamapper.shape``
    # (as before) cannot give parameterGridInitialization the object it needs.
    feature_columns = traindf.columns.difference([target])
    trainX = datamapper.fit_transform(traindf[feature_columns])

    # estimate optimal parameters grid space
    configspace = parameterGridInitialization(trainX)
    lightgbm_best = produceBestLightgbmModel(traindf, datamapper, target, configspace, figpath)
    return lightgbm_best, traindf, testdf

In [None]:
# <api>
def parameterGridInitialization(trainX):
    """
    Build the four parameter grids consumed by the staged grid search.

    ``trainX`` only needs a two-dimensional ``shape``.  The number of
    columns minus one picks the ``max_depth`` candidates and the number of
    rows picks the ``min_child_weight`` candidates.

    Returns
    -------
    tuple of four dicts, in this order:
        1. ``n_estimators`` ceiling used while looking for the optimal
           number of boosting rounds,
        2. tree-shape and sampling parameters (the most important ones),
        3. regularization parameters,
        4. learning-rate candidates.
    """
    feature_count = trainX.shape[1] - 1
    sample_count = trainX.shape[0]

    # stage 1: fixed learning rate, only an upper bound on boosting rounds
    rounds_grid = dict(n_estimators=[1000])

    # stage 2: most important parameters
    tree_grid = dict(
        max_depth=max_depth_space(feature_count),
        min_child_weight=min_child_weight_space(sample_count),
        subsample=[0.6, 0.7, 0.8, 0.9],
        colsample_bytree=[0.6, 0.7, 0.8, 0.9],
    )

    # stage 3: regularization parameters
    regularization_grid = dict(
        reg_alpha=[0, 0.01, 0.1, 1, 10],
        reg_lambda=[0.1, 0.3, 0.5],
    )

    # stage 4: learning_rate parameters
    learning_rate_grid = dict(learning_rate=[0.01, 0.05, 0.1])

    return rounds_grid, tree_grid, regularization_grid, learning_rate_grid

In [None]:
# <api>
def configSpaceInitialization(trainX):
    """
    Sample skopt search space for LGBMClassifier.

    Two presets exist; the one returned depends on whether ``trainX`` has
    at least 10 columns.  Only ``max_depth``, ``n_estimators``,
    ``colsample_bytree`` and ``num_leaves`` differ between the presets.
    Tuples are ``(low, high)`` bounds; ``Categorical`` entries list the
    allowed choices.
    """
    many_features = trainX.shape[1] >= 10

    skopt_grid = {
        'max_depth': (3, 9) if many_features else (2, 3),
        'learning_rate': (0.01, 0.1),
        'n_estimators': (50, 800) if many_features else (20, 100),
        'boosting_type': Categorical(('gbdt',)),
        'xgboost_dart_mode': Categorical((True,)),
        'min_child_weight': (1, 5),
        'subsample_freq': (1, 5),
        'colsample_bytree': (0.2, 0.9) if many_features else (0.9, 1),
        'reg_alpha': (1, 5),
        'reg_lambda': (0.1, 0.5),
        'drop_rate': (0.01, 0.5),
        'scale_pos_weight': (1, 5),
        'num_leaves': (20, 40) if many_features else (15, 31),
    }
    return skopt_grid

In [1]:
# <api>
def produceBestLightgbmModel(traindf, datamapper, target,
                             configspace,
                             fig_path=None, seed=27):
    """
    Run the staged grid search and return an LGBMClassifier configured with
    the best parameters found.  The returned classifier is constructed but
    not fitted here.

    Parameters
    ----------
    traindf : training frame containing the feature columns and ``target``.
    datamapper : object exposing ``fit_transform``; fitted on every column
        of ``traindf`` except ``target``.
    target : name of the label column.
    configspace : the four parameter grids, in the order returned by
        ``parameterGridInitialization``.
    fig_path : accepted for interface compatibility; not used in this
        function.
    seed : random seed used for BOTH the grid search and the returned
        classifier.

    Returns
    -------
    LGBMClassifier
    """
    param_grid1, param_grid2, param_grid3, param_grid4 = configspace

    # datamapper transform
    feature_columns = traindf.columns.difference([target])
    train = np.array(datamapper.fit_transform(traindf[feature_columns]))
    labels_train = traindf[target]

    # running grid search to get the best parameter set; the seed is forwarded
    # so a caller-supplied seed governs the search as well as the final model
    (best_subsample, best_estimators, best_learning_rate, best_max_depth,
     best_min_child_weight, best_colsample_bytree, best_reg_alpha, best_reg_lambda) =\
        lightGBMGridSearch(train, labels_train,
                           param_grid1, param_grid2, param_grid3, param_grid4,
                           seed=seed)

    # Build the classifier from the best parameter set.  subsample_freq=1
    # mirrors the search estimators: LightGBM only applies `subsample`
    # (bagging_fraction) when the bagging frequency is non-zero.
    lgbm_best = LGBMClassifier(n_estimators=best_estimators, learning_rate=best_learning_rate,
                               max_depth=best_max_depth, min_child_weight=best_min_child_weight,
                               subsample=best_subsample, subsample_freq=1,
                               colsample_bytree=best_colsample_bytree,
                               reg_alpha=best_reg_alpha, reg_lambda=best_reg_lambda,
                               objective='binary', nthread=-1,
                               scale_pos_weight=1, seed=seed)
    return lgbm_best

In [None]:
# <api>
def max_depth_space(feature_size):
    """
    Candidate ``max_depth`` values for the grid search.

    More than 1000 features yields the deeper set 5, 7, ..., 13; otherwise
    the set is 3, 5, 7, 9.
    """
    first, stop = (5, 14) if feature_size > 1000 else (3, 10)
    return list(range(first, stop, 2))


# <api>
def min_child_weight_space(train_size):
    """
    Candidate ``min_child_weight`` values for the grid search.

    Five consecutive integers are returned: 3..7 when the training set has
    more than 10000 rows, 1..5 otherwise.
    """
    lowest = 3 if train_size > 10000 else 1
    return [lowest + step for step in range(5)]

In [None]:
# <api>
# train lightgbm to get best n_estimators
def lightGBMTrainBestn_estimators(alg, train, labels_train,
                                  early_stopping_rounds=50,
                                  valid_fraction=0.2, seed=27):
    """
    Fit ``alg`` with early stopping and return the best number of boosting
    rounds.

    Previously ``early_stopping_rounds`` was accepted but never used and the
    model was fitted without an evaluation set, so no early stopping could
    take place.  A random hold-out slice of the data is now used as the
    evaluation set.

    Parameters
    ----------
    alg : estimator with a LightGBM-style ``fit`` accepting ``eval_set``,
        ``early_stopping_rounds`` and ``verbose`` keyword arguments.
        NOTE(review): LightGBM >= 4 moved early stopping to callbacks;
        confirm the installed version if this call is rejected.
    train : 2-D feature matrix (converted with ``np.asarray``).
    labels_train : labels aligned with the rows of ``train``.
    early_stopping_rounds : patience passed to ``fit``; a falsy value
        disables early stopping.
    valid_fraction : share of rows held out for evaluation.
    seed : seed for the hold-out shuffle.

    Returns
    -------
    int
        Best iteration reported by the fitted model, or ``alg.n_estimators``
        when the model reports no positive best iteration.
    """
    features = np.asarray(train)
    # Positional indexing below must not be interpreted as label lookup
    # (labels may arrive as a pandas Series), hence the array conversion.
    labels = np.asarray(labels_train)

    row_count = features.shape[0]
    valid_count = int(row_count * valid_fraction)

    if early_stopping_rounds and 0 < valid_count < row_count:
        shuffled = np.random.RandomState(seed).permutation(row_count)
        valid_rows, fit_rows = shuffled[:valid_count], shuffled[valid_count:]
        alg.fit(features[fit_rows], labels[fit_rows],
                eval_set=[(features[valid_rows], labels[valid_rows])],
                early_stopping_rounds=early_stopping_rounds,
                verbose=False)
    else:
        # Too little data to hold anything out (or early stopping disabled).
        alg.fit(train, labels_train)

    # The attribute is spelled ``best_iteration_`` or ``best_iteration``
    # depending on the LightGBM release.
    best_iteration = getattr(alg, 'best_iteration_', None)
    if best_iteration is None:
        best_iteration = getattr(alg, 'best_iteration', None)

    # No usable value means early stopping did not trigger: every round was
    # used, so fall back to the configured number of estimators.
    if best_iteration is None or best_iteration <= 0:
        best_iteration = alg.n_estimators
    return best_iteration

In [None]:
# <api>
def lightGBMGridSearch(train, labels_train,
                       param_grid1, param_grid2, param_grid3, param_grid4, seed=27):
    """
    Staged LightGBM grid search.

    Stages
    ------
    1. With a fixed learning rate of 0.1, find the number of boosting rounds
       (upper bound: ``param_grid1['n_estimators'][0]``).
    2. Search ``param_grid2`` (read back: max_depth, min_child_weight,
       subsample, colsample_bytree).
    3. Search ``param_grid3`` (read back: reg_alpha, reg_lambda).
    4. Search ``param_grid4`` (read back: learning_rate).
    5. With every tuned parameter applied and twice the stage-1 upper bound,
       re-estimate the number of boosting rounds.

    Every grid search uses 5-fold cross-validation scored by ROC AUC.

    Parameters
    ----------
    train : feature matrix.
    labels_train : binary labels aligned with ``train``.
    param_grid1..param_grid4 : dicts as produced by
        ``parameterGridInitialization``.
    seed : random seed applied to every estimator built here.

    Returns
    -------
    tuple
        ``(best_subsample, best_estimators, best_learning_rate,
        best_max_depth, best_min_child_weight, best_colsample_bytree,
        best_reg_alpha, best_reg_lambda)``.
    """
    # Constructor arguments shared by every estimator in this routine.  The
    # objective is LightGBM's 'binary' throughout ('binary:logistic' is an
    # XGBoost name) and the caller's seed is honoured everywhere.
    common = dict(objective='binary', nthread=-1, scale_pos_weight=1, seed=seed)

    # LightGBM applies `subsample` only when the bagging frequency is
    # non-zero, so it is enabled wherever subsample is searched or applied.
    bagging_freq = 1

    max_estimators = param_grid1['n_estimators'][0]

    def run_search(estimator, grid):
        """Grid-search ``grid`` on ``estimator``; return the winner's params."""
        search = GridSearchCV(estimator=estimator,
                              param_grid=grid, scoring='roc_auc',
                              n_jobs=1, iid=False, cv=5)
        search.fit(train, labels_train)
        return search.best_estimator_.get_params()

    # stage 1: number of boosting rounds at the baseline settings
    estimator = LGBMClassifier(max_depth=3, min_child_weight=1,
                               subsample_freq=bagging_freq, learning_rate=0.1,
                               n_estimators=max_estimators,
                               colsample_bytree=0.8,
                               **common)
    best_estimators = lightGBMTrainBestn_estimators(estimator, train, labels_train)

    # stage 2: tree shape and sampling
    estimator1 = LGBMClassifier(n_estimators=best_estimators,
                                learning_rate=0.1,
                                subsample_freq=bagging_freq,
                                **common)
    best_parameters = run_search(estimator1, param_grid2)
    best_max_depth = best_parameters['max_depth']
    best_min_child_weight = best_parameters['min_child_weight']
    # Read the tuned bagging fraction itself, not the bagging frequency.
    best_subsample = best_parameters['subsample']
    best_colsample_bytree = best_parameters['colsample_bytree']

    tuned = dict(max_depth=best_max_depth,
                 min_child_weight=best_min_child_weight,
                 subsample=best_subsample,
                 subsample_freq=bagging_freq,
                 colsample_bytree=best_colsample_bytree)

    # stage 3: regularization
    estimator2 = LGBMClassifier(n_estimators=best_estimators,
                                learning_rate=0.1,
                                **dict(tuned, **common))
    best_parameters = run_search(estimator2, param_grid3)
    best_reg_alpha = best_parameters['reg_alpha']
    best_reg_lambda = best_parameters['reg_lambda']

    tuned.update(reg_alpha=best_reg_alpha, reg_lambda=best_reg_lambda)

    # stage 4: learning rate (left unset here because it is being searched).
    # The estimator must be a single object; a stray trailing comma used to
    # turn it into a one-element tuple.
    estimator3 = LGBMClassifier(n_estimators=best_estimators,
                                **dict(tuned, **common))
    best_parameters = run_search(estimator3, param_grid4)
    best_learning_rate = best_parameters['learning_rate']

    # stage 5: re-estimate the number of rounds with the final settings,
    # allowing twice the original upper bound
    estimator = LGBMClassifier(n_estimators=max_estimators * 2,
                               learning_rate=best_learning_rate,
                               **dict(tuned, **common))
    best_estimators = lightGBMTrainBestn_estimators(estimator, train, labels_train)

    return (best_subsample, best_estimators, best_learning_rate, best_max_depth,
            best_min_child_weight, best_colsample_bytree, best_reg_alpha, best_reg_lambda)

In [None]:
# <api>
def produceBestModel(traindf, datamapper, target, configspace, fig_path=None, seed=27):
    """
    Entry point that delegates to ``produceBestLightgbmModel`` with the
    same arguments and returns its result unchanged.
    """
    best_model = produceBestLightgbmModel(traindf, datamapper, target, configspace,
                                          fig_path=fig_path, seed=seed)
    return best_model

In [None]:
# <api>
def optimizeBestModel(traindf, datamapper, target,
                      configspace, search_alg,
                      fig_path=None, n_calls=100,
                      verbose=0, seed=27):
    """
    Search ``configspace`` with ``modelfit.searchBestParamsSkopt`` and return
    an LGBMClassifier configured with the best parameters found.  The
    returned classifier is constructed but not fitted here.

    Parameters
    ----------
    traindf : training frame containing the feature columns and ``target``.
    datamapper : object exposing ``fit_transform``; fitted on every column
        of ``traindf`` except ``target``.
    target : name of the label column.
    configspace : search space handed to the search helper (see
        ``configSpaceInitialization`` for a sample).
    search_alg : search algorithm selector, forwarded to the search helper.
    fig_path, verbose : accepted for interface compatibility; not used in
        this function.
    n_calls : forwarded to the search helper.
    seed : random seed of the returned classifier.

    Returns
    -------
    LGBMClassifier

    Raises
    ------
    KeyError
        If the best-parameter mapping lacks one of the required keys read
        below.
    """
    # datamapper transform
    feature_columns = traindf.columns.difference([target])
    train = np.array(datamapper.fit_transform(traindf[feature_columns]))
    labels_train = np.array(traindf[target])

    # running the search to get the best parameter set; the second return
    # value of the helper is not needed here
    best_params, _ = modelfit.searchBestParamsSkopt(train, labels_train,
                                                    configspace, search_alg,
                                                    LGBMClassifier, n_calls)

    # The sample search space in this file also tunes num_leaves; forward it
    # when the search produced it so the returned model matches the best
    # configuration instead of silently using the library default.
    # NOTE(review): assumes best_params supports `in` by key - confirm
    # against searchBestParamsSkopt.
    optional = {}
    if 'num_leaves' in best_params:
        optional['num_leaves'] = best_params['num_leaves']

    # build a gbm using the best parameter set
    lgbm_best = LGBMClassifier(boosting_type='gbdt',
                               n_estimators=best_params['n_estimators'],
                               learning_rate=best_params['learning_rate'],
                               max_depth=best_params['max_depth'],
                               min_child_weight=best_params['min_child_weight'],
                               subsample_freq=best_params['subsample_freq'],
                               colsample_bytree=best_params['colsample_bytree'],
                               reg_alpha=best_params['reg_alpha'],
                               reg_lambda=best_params['reg_lambda'],
                               scale_pos_weight=best_params['scale_pos_weight'],
                               objective='binary',
                               nthread=-1, seed=seed,
                               **optional)

    return lgbm_best