In [None]:
# <api>
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

import work.marvin.binary_classifier_models.modelfit as modelfit

import logging

logger = logging.getLogger(__name__)

In [None]:
# <api>
def bestModelProducer(data, target, datamapper, fig_path):
    """
    Automatic LR model selection, 3 steps:
    1. split ``data`` into train/test frames with
       ``modelfit.prepareDataforTraining``
    2. grid-search the regularisation penalty ('l1' vs 'l2') on the
       training frame
    3. return the LogisticRegression that ``produceBestLRmodel`` builds
       from the best parameter set

    :param data: input handed to ``modelfit.prepareDataforTraining``
    :param target: name of the label column in the training frame
    :param datamapper: transformer exposing ``fit_transform``; it is applied
                       to every column except ``target``
    :param fig_path: forwarded unchanged to ``produceBestLRmodel``
    :return: whatever ``produceBestLRmodel`` returns (a single estimator)
    """
    # Only the training split is needed for model selection.
    traindf, _ = modelfit.prepareDataforTraining(data, datamapper)
    param_grid = {'penalty': ['l1', 'l2']}

    # produceBestLRmodel returns one estimator, not a
    # (model, accuracy, auc, cv_score) tuple, so it must not be unpacked.
    return produceBestLRmodel(traindf, datamapper, target, param_grid, fig_path)

In [None]:
# <api>
def produceBestLRmodel(traindf, datamapper, target, param_grid, fig_path=None, seed=27):
    """
    Grid-search a LogisticRegression and return an estimator configured
    with the winning hyper-parameters.

    :param traindf: training DataFrame containing the feature columns and
                    the ``target`` column
    :param datamapper: transformer exposing ``fit_transform``; it is fitted
                       on every column of ``traindf`` except ``target``
    :param target: name of the label column
    :param param_grid: parameter grid passed to ``GridSearchCV``
    :param fig_path: accepted for interface compatibility; not used here
    :param seed: ``random_state`` of the LogisticRegression being searched
    :return: a new, *unfitted* LogisticRegression carrying every parameter
             of the best estimator found by the search
    """
    # datamapper transform: everything except the target is a feature
    feature_columns = traindf.columns.difference([target])
    train = np.array(datamapper.fit_transform(traindf[feature_columns]))
    labels_train = traindf[target]

    # running grid search to get the best parameter set
    gsearch = GridSearchCV(estimator=LogisticRegression(random_state=seed),
                           param_grid=param_grid,
                           scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
    gsearch.fit(train, labels_train)

    best_parameters = gsearch.best_estimator_.get_params()
    logger.debug("best parameters:{}".format(best_parameters))

    # Rebuild from the full parameter set (it already includes random_state)
    # so that every searched hyper-parameter is kept, not only the penalty.
    return LogisticRegression(**best_parameters)

In [None]:
# <api>
def produceBestModel(traindf, datamapper, target, param_grid, fig_path=None, seed=27):
    """Alias of ``produceBestLRmodel``: delegates with all arguments unchanged."""
    best_model = produceBestLRmodel(traindf, datamapper, target, param_grid,
                                    fig_path=fig_path, seed=seed)
    return best_model

In [None]:
# <api>
def optimizeBestModel(traindf, datamapper, target, param_grid=None,
                      search_alg=None, fig_path=None, n_calls=None, seed=27):
    """
    Select the best LogisticRegression via ``produceBestLRmodel``.

    :param traindf: training DataFrame, forwarded to ``produceBestLRmodel``
    :param datamapper: forwarded to ``produceBestLRmodel``
    :param target: name of the label column
    :param param_grid: parameter grid to search; when ``None`` the default
                       ``{'penalty': ['l1', 'l2']}`` is used
    :param search_alg: accepted but not used by this function
    :param fig_path: forwarded to ``produceBestLRmodel``
    :param n_calls: accepted but not used by this function
    :param seed: forwarded to ``produceBestLRmodel``
    :return: the model returned by ``produceBestLRmodel``
    """
    # Use the default grid only when the caller did not supply one;
    # previously a caller-provided grid was silently discarded.
    if param_grid is None:
        param_grid = {'penalty': ['l1', 'l2']}
    return produceBestLRmodel(traindf, datamapper, target, param_grid,
                              fig_path=fig_path, seed=seed)