In [13]:
import sys
import csv
import os
## hyperopt
# from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import hstack

from sklearn.datasets import load_svmlight_file, dump_svmlight_file

In [2]:
def check_model(models, feat_name):
    """Decide whether `feat_name` passes the `models` filter.

    Args:
        models: either the literal string "all" (accept everything) or an
            iterable of substrings to look for.
        feat_name: the feature/model name being tested.

    Returns:
        True if `models` is "all" or any entry of `models` occurs inside
        `feat_name`; False otherwise.
    """
    return models == "all" or any(tag in feat_name for tag in models)

In [3]:
## xgboost runtime settings
xgb_random_seed = 2017
xgb_nthread = 2
xgb_dmatrix_silent = True

## sklearn runtime settings
skl_random_seed = 2017
skl_n_jobs = 2

## search ranges (min, max, step) for the iteration-count style hyper-parameters
xgb_min_num_round, xgb_max_num_round, xgb_num_round_step = 10, 500, 10
skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step = 10, 500, 10
libfm_min_iter, libfm_max_iter, iter_step = 10, 500, 10

## hyperopt evaluation budget, keyed as "<model>_max_evals"
hyperopt_param = {
    "%s_max_evals" % model_key: 200
    for model_key in ("xgb", "rf", "etr", "gbm", "lr", "ridge",
                      "lasso", "svr", "dnn", "libfm", "rgf")
}

In [4]:
# Root folder for everything this notebook writes (predictions, submissions, logs).
output_path = "../../Output"
log_path = "%s/Log" % output_path
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(log_path, exist_ok=True)

In [5]:
## Model under tuning: xgboost regression with the linear booster.
specified_models = "[Pre@solution]_[Feat@svd100_and_bow_Jun27]_[Model@reg_xgb_linear]"

param_space_reg_xgb_linear = {
    # fixed settings
    "task": "regression",
    "booster": "gblinear",
    "objective": "reg:linear",
    # searched dimensions: hp.quniform(label, low, high, step)
    "eta": hp.quniform("eta", 0.01, 1, 0.01),
    "lambda": hp.quniform("lambda", 0, 5, 0.05),
    "alpha": hp.quniform("alpha", 0, 0.5, 0.005),
    "lambda_bias": hp.quniform("lambda_bias", 0, 3, 0.1),
    "num_round": hp.quniform(
        "num_round", xgb_min_num_round, xgb_max_num_round, xgb_num_round_step
    ),
    # runtime settings
    "nthread": xgb_nthread,
    "silent": 1,
    "seed": xgb_random_seed,
    # not a model parameter: the search cell reads this as the fmin() budget
    "max_evals": hyperopt_param["xgb_max_evals"],
}

## Names consumed by the logging, objective and search cells below.
feat_folder = "../data/crowdflower-search-relevance/Feat/solution/svd100_and_bow_Jun27"
feat_name = specified_models
param_space = param_space_reg_xgb_linear

In [6]:
# One CSV log per tuned model: a header row now, then one row per hyperopt trial
# (appended by hyperopt_wrapper, which is why the handle is left open here).
log_file = "%s/%s_hyperopt.log" % (log_path, feat_name)
# newline='' is required for files handed to csv.writer; without it the writer's
# "\r\n" row terminator is translated again on Windows and yields blank rows.
log_handler = open(log_file, 'w', newline='')
writer = csv.writer(log_handler)
# Parameter columns are sorted by name so they line up with the per-trial rows,
# which are also written in sorted-key order.
headers = ['trial_counter', 'kappa_mean', 'kappa_std'] + sorted(param_space)
writer.writerow(headers)
log_handler.flush()

In [10]:
from utils import *

### global params read by hyperopt_obj
## bagging can be used to stabilize the predictions
bagging_size = 1               # number of bagging rounds per fit
bootstrap_replacement = False  # True: draw row indices with rng.randint (with replacement)
bootstrap_ratio = 1            # sample-size factor (replacement) / keep threshold (no replacement)

ebc_hard_threshold = False     # flag handed to the EBC evaluation helper
verbose_level = 1              # >= 1 prints CV mean/std; >= 2 also enables the xgboost watchlist


def _align_feature_columns(X_a, X_b):
    """Give two sparse feature matrices the same number of columns.

    The narrower matrix is right-padded with all-zero columns (the two files are
    loaded independently, so their inferred widths can differ). Both matrices are
    returned in CSR format so they can be row-indexed.
    """
    width_a, width_b = X_a.shape[1], X_b.shape[1]
    if width_b < width_a:
        X_b = hstack([X_b, np.zeros((X_b.shape[0], width_a - width_b))])
    elif width_b > width_a:
        X_a = hstack([X_a, np.zeros((X_a.shape[0], width_b - width_a))])
    return X_a.tocsr(), X_b.tocsr()


def _bootstrap_index(rng, num_rows):
    """Pick the training-row indices for one bagging round.

    With `bootstrap_replacement` the indices are drawn with replacement
    (`int(num_rows * bootstrap_ratio)` draws); otherwise every row whose uniform
    draw is below `bootstrap_ratio` is kept.
    """
    if bootstrap_replacement:
        sample_size = int(num_rows * bootstrap_ratio)
        return rng.randint(num_rows, size=sample_size)
    randnum = rng.uniform(size=num_rows)
    return [i for i in range(num_rows) if randnum[i] < bootstrap_ratio]


#### train CV and final model with a specified parameter setting
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):
    """Cross-validate one xgboost parameter set, then retrain on all data and predict test.

    For each of 3 runs x 3 folds the model is trained on `<feat_folder>/RunR/FoldF`
    and scored on that fold's validation set with quadratic weighted kappa; raw and
    rank predictions are written under `<output_path>/RunR/FoldF`. Afterwards the
    model is retrained on `<feat_folder>/All` and the raw, rank and scored test
    predictions are written under `<output_path>/All` and `<output_path>/Subm`.

    Args:
        param: xgboost parameter dict; must contain "booster" and an integer
            "num_round".
        feat_folder: root folder of the prepared feature files.
        feat_name: model/feature identifier used in output file names.
        trial_counter: trial id used in output file names.

    Returns:
        (kappa_cv_mean, kappa_cv_std): mean and standard deviation of the
        validation kappa over all runs and folds.

    Raises:
        ValueError: if `param` has no "booster" key (only xgboost models are
            handled here).
    """
    if "booster" not in param:
        # Fail early with a clear message instead of an unbound-variable error
        # when the training call is reached.
        raise ValueError("hyperopt_obj only supports xgboost parameter sets "
                         "(expected a 'booster' key in param)")

    n_runs = 3
    n_folds = 3
    kappa_cv = np.zeros((n_runs, n_folds), dtype=float)
    for run in range(1, n_runs + 1):
        for fold in range(1, n_folds + 1):
            # Per-fold generator so bootstrap sampling is reproducible.
            rng = np.random.RandomState(2017 + 1000 * run + 10 * fold)
            #### all the path
            path = "%s/Run%d/Fold%d" % (feat_folder, run, fold)
            save_path = "%s/Run%d/Fold%d" % (output_path, run, fold)
            os.makedirs(save_path, exist_ok=True)
            # feat
            feat_train_path = "%s/train.feat" % path
            feat_valid_path = "%s/valid.feat" % path
            # weight
            weight_train_path = "%s/train.feat.weight" % path
            weight_valid_path = "%s/valid.feat.weight" % path
            # info
            info_train_path = "%s/train.info" % path
            info_valid_path = "%s/valid.info" % path
            # cdf
            cdf_valid_path = "%s/valid.cdf" % path
            # raw prediction path (rank)
            raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
            rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)

            ## load feat
            X_train, labels_train = load_svmlight_file(feat_train_path)
            X_valid, labels_valid = load_svmlight_file(feat_valid_path)
            X_train, X_valid = _align_feature_columns(X_train, X_valid)
            ## load weight
            weight_train = np.loadtxt(weight_train_path, dtype=float)
            weight_valid = np.loadtxt(weight_valid_path, dtype=float)
            ## load info
            numTrain = pd.read_csv(info_train_path).shape[0]
            info_valid = pd.read_csv(info_valid_path)
            numValid = info_valid.shape[0]
            Y_valid = info_valid["median_relevance"]
            ## load cdf
            cdf_valid = np.loadtxt(cdf_valid_path, dtype=float)
            ## eval function handed to xgboost (binds this fold's cdf)
            evalerror_regrank_valid = lambda preds, dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid)

            ##############
            ## Training ##
            ##############
            # The validation matrix does not depend on the bagging round.
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid, weight=weight_valid)
            preds_bagging = np.zeros((numValid, bagging_size), dtype=float)
            for n in range(bagging_size):
                index_base = _bootstrap_index(rng, numTrain)
                dtrain_base = xgb.DMatrix(X_train[index_base], label=labels_train[index_base],
                                          weight=weight_train[index_base])
                watchlist = []
                if verbose_level >= 2:
                    watchlist = [(dtrain_base, 'train'), (dvalid_base, 'valid')]
                ## regression with xgboost
                bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                                feval=evalerror_regrank_valid)
                preds_bagging[:, n] = bst.predict(dvalid_base)
                ## running average over the bags trained so far
                pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)
                pred_rank = pred_raw.argsort().argsort()
                pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True)
                kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid)
                # Intermediate bags and the final bag use different column layouts.
                if (n + 1) != bagging_size:
                    line_fmt = "              {:>3}   {:>3}   {:>3}   {:>6}   {} x {}"
                else:
                    line_fmt = "                    {:>3}       {:>3}      {:>3}    {:>8}  {} x {}"
                print(line_fmt.format(
                    run, fold, n + 1, np.round(kappa_valid, 6), X_train.shape[0], X_train.shape[1]))
            kappa_cv[run - 1, fold - 1] = kappa_valid
            ## save the raw prediction
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw})
            dfPred.to_csv(raw_pred_valid_path, index=False, header=True,
                          columns=["target", "prediction"])
            ## save the rank prediction
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank})
            dfPred.to_csv(rank_pred_valid_path, index=False, header=True,
                          columns=["target", "prediction"])

    kappa_cv_mean = np.mean(kappa_cv)
    kappa_cv_std = np.std(kappa_cv)
    if verbose_level >= 1:
        print("              Mean: %.6f" % kappa_cv_mean)
        print("              Std: %.6f" % kappa_cv_std)

    ####################
    #### Retraining ####
    ####################
    #### all the path
    path = "%s/All" % (feat_folder)
    save_path = "%s/All" % output_path
    subm_dir = "%s/Subm" % output_path
    os.makedirs(save_path, exist_ok=True)
    os.makedirs(subm_dir, exist_ok=True)
    # feat
    feat_train_path = "%s/train.feat" % path
    feat_test_path = "%s/test.feat" % path
    # weight
    weight_train_path = "%s/train.feat.weight" % path
    # info
    info_train_path = "%s/train.info" % path
    info_test_path = "%s/test.info" % path
    # cdf
    cdf_test_path = "%s/test.cdf" % path
    # raw prediction path (rank)
    raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # submission path (relevance as in [1,2,3,4])
    subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (
        subm_dir, feat_name, trial_counter, kappa_cv_mean, kappa_cv_std)

    #### load data
    ## load feat
    X_train, labels_train = load_svmlight_file(feat_train_path)
    X_test, labels_test = load_svmlight_file(feat_test_path)
    X_train, X_test = _align_feature_columns(X_train, X_test)
    ## load train weight
    weight_train = np.loadtxt(weight_train_path, dtype=float)
    ## load info
    numTrain = pd.read_csv(info_train_path).shape[0]
    info_test = pd.read_csv(info_test_path)
    numTest = info_test.shape[0]
    id_test = info_test["id"]
    ## load cdf
    cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    ## eval function handed to xgboost (binds the test cdf)
    evalerror_regrank_test = lambda preds, dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)

    ## bagging
    # NOTE: `rng` is deliberately the generator left over from the last CV fold,
    # which keeps the sampling sequence identical to previous runs.
    dtest = xgb.DMatrix(X_test, label=labels_test)
    preds_bagging = np.zeros((numTest, bagging_size), dtype=float)
    for n in range(bagging_size):
        index_base = _bootstrap_index(rng, numTrain)
        dtrain = xgb.DMatrix(X_train[index_base], label=labels_train[index_base],
                             weight=weight_train[index_base])
        watchlist = []
        if verbose_level >= 2:
            watchlist = [(dtrain, 'train')]
        bst = xgb.train(param, dtrain, param['num_round'], watchlist,
                        feval=evalerror_regrank_test)
        preds_bagging[:, n] = bst.predict(dtest)
    pred_raw = np.mean(preds_bagging, axis=1)
    pred_rank = pred_raw.argsort().argsort()

    ## write raw prediction
    output = pd.DataFrame({"id": id_test, "prediction": pred_raw})
    output.to_csv(raw_pred_test_path, index=False)

    ## write rank prediction
    output = pd.DataFrame({"id": id_test, "prediction": pred_rank})
    output.to_csv(rank_pred_test_path, index=False)

    ## write score
    # Score the bag-averaged prediction; the previous code scored only the last
    # bag's prediction, silently ignoring bagging when bagging_size > 1
    # (identical result when bagging_size == 1).
    pred_score = getScore(pred_raw, cdf_test)
    output = pd.DataFrame({"id": id_test, "prediction": pred_score})
    output.to_csv(subm_path, index=False)

    return kappa_cv_mean, kappa_cv_std

In [11]:
## hyper-parameters that hyperopt_wrapper casts to int before use
int_feat = (
    "num_round n_estimators max_depth degree "
    "hidden_units hidden_layers batch_size nb_epoch "
    "dim iter "
    "max_leaf_forest num_iteration_opt num_tree_search min_pop opt_interval"
).split()

def hyperopt_wrapper(param, feat_folder, feat_name):
    """Objective called by hyperopt for one sampled parameter set.

    Bumps the global trial counter, casts the integer-valued parameters listed in
    `int_feat`, prints the trial banner, evaluates the parameters with
    `hyperopt_obj`, appends one row to the CSV log and returns the result dict
    for hyperopt (loss is the negated mean kappa, std is stored as an attachment).

    Args:
        param: parameter dict sampled by hyperopt; integer-valued entries are
            converted in place.
        feat_folder: feature folder forwarded to `hyperopt_obj`.
        feat_name: model/feature identifier forwarded to `hyperopt_obj`.
    """
    global trial_counter
    global log_handler
    trial_counter += 1

    # cast the integer-valued parameters in place
    for key in int_feat:
        if key in param:
            param[key] = int(param[key])

    # parameters in name order: same order as the log header columns
    ordered_params = sorted(param.items())

    banner = [
        "------------------------------------------------------------",
        "Trial %d" % trial_counter,
        "        Model",
        "              %s" % feat_name,
        "        Param",
    ]
    banner.extend("              %s: %s" % (name, value) for name, value in ordered_params)
    banner.append("        Result")
    banner.append("                    Run      Fold      Bag      Kappa      Shape")
    for line in banner:
        print(line)

    ## evaluate performance -- this is the key step
    kappa_cv_mean, kappa_cv_std = hyperopt_obj(param, feat_folder, feat_name, trial_counter)

    ## log: trial id, kappa stats, then every parameter value in name order
    row = ["%d" % trial_counter, "%.6f" % kappa_cv_mean, "%.6f" % kappa_cv_std]
    row += ["%s" % value for _, value in sorted(param.items())]
    writer.writerow(row)
    log_handler.flush()

    return {'loss': -kappa_cv_mean, 'attachments': {'std': kappa_cv_std}, 'status': STATUS_OK}

In [None]:
print("************************************************************")
print("Search for the best params")
# module-level counter; hyperopt_wrapper increments it on every evaluation
trial_counter = 0
trials = Trials()


def objective(p):
    """Evaluate one sampled parameter set `p` for the configured feature set."""
    return hyperopt_wrapper(p, feat_folder, feat_name)


best_params = fmin(objective, param_space, algo=tpe.suggest,
                   trials=trials, max_evals=param_space["max_evals"])

## Reporting of the best trial is currently disabled:
# for f in int_feat:
#     if f in best_params:
#         best_params[f] = int(best_params[f])
# print("************************************************************")
# print("Best params")
# for k,v in best_params.items():
#     print("        %s: %s" % (k,v))
# trial_kappas = -np.asarray(trials.losses(), dtype=float)
# best_kappa_mean = max(trial_kappas)
# ind = np.where(trial_kappas == best_kappa_mean)[0][0]
# best_kappa_std = trials.trial_attachments(trials.trials[ind])['std']
# print("Kappa stats")
# print("        Mean: %.6f\n        Std: %.6f" % (best_kappa_mean, best_kappa_std))

************************************************************
Search for the best params
------------------------------------------------------------
Trial 1
        Model
              [Pre@solution]_[Feat@svd100_and_bow_Jun27]_[Model@reg_xgb_linear]
        Param
              alpha: 0.21
              booster: gblinear
              eta: 0.06
              lambda: 4.7
              lambda_bias: 0.2
              max_evals: 200
              nthread: 2
              num_round: 230
              objective: reg:linear
              seed: 2017
              silent: 1
              task: regression
        Result
                    Run      Fold      Bag      Kappa      Shape
                      1         1        1     0.67035  3470 x 124790
                      1         2        1    0.665376  3393 x 125458
                      1         3        1    0.660125  3295 x 115835
                      2         1        1    0.660489  3470 x 126995
                      2         2    

                      1         1        1    0.672687  3470 x 124790
                      1         2        1    0.666445  3393 x 125458
                      1         3        1    0.660734  3295 x 115835
                      2         1        1    0.666876  3470 x 126995
                      2         2        1    0.678471  3393 x 119863
                      2         3        1    0.666245  3295 x 116069
                      3         1        1    0.671777  3470 x 125768
                      3         2        1    0.672868  3393 x 121161
                      3         3        1    0.658525  3295 x 116847
              Mean: 0.668292
              Std: 0.005945
------------------------------------------------------------
Trial 12
        Model
              [Pre@solution]_[Feat@svd100_and_bow_Jun27]_[Model@reg_xgb_linear]
        Param
              alpha: 0.385
              booster: gblinear
              eta: 0.34
              lambda: 1.4000000000000001
           

                      1         1        1    0.670817  3470 x 124790
                      1         2        1     0.66614  3393 x 125458
                      1         3        1    0.662563  3295 x 115835
                      2         1        1    0.664228  3470 x 126995
                      2         2        1    0.678625  3393 x 119863
                      2         3        1    0.666853  3295 x 116069
                      3         1        1    0.672401  3470 x 125768
                      3         2        1    0.673022  3393 x 121161
                      3         3        1    0.658525  3295 x 116847
              Mean: 0.668130
              Std: 0.005808
------------------------------------------------------------
Trial 19
        Model
              [Pre@solution]_[Feat@svd100_and_bow_Jun27]_[Model@reg_xgb_linear]
        Param
              alpha: 0.28500000000000003
              booster: gblinear
              eta: 0.05
              lambda: 1.5
            

KeyboardInterrupt: 