In [None]:
def cross_validate_lgb(params, x_train, y_train, x_test, kf, cat_features=None,
                       verbose=True, verbose_eval=100, nseeds=1, df_input=True,
                       early_stopping=100, num_boost_round=8000):
    """Cross-validate a LightGBM model trained on ``log1p``-transformed targets.

    For every fold produced by ``kf`` and every seed in ``range(nseeds)`` a
    booster is trained on ``log1p(y)`` with early stopping on the validation
    fold. Predictions are mapped back with ``expm1`` and averaged over seeds
    (out-of-fold predictions) or over seeds and folds (test predictions).

    Args:
        params: LightGBM parameter dict. It is copied; the caller's dict is
            not modified when the per-seed keys are set.
        x_train: Training features. Indexed positionally via ``.iloc`` when
            ``df_input`` is true, otherwise via ``x_train[index]``.
        y_train: Training targets on the original (untransformed) scale.
            Anything exposing ``.iloc`` is indexed positionally through it;
            other inputs are indexed with ``y_train[index]``.
        x_test: Test features passed to ``gbm.predict``.
        kf: Splitter exposing ``split(x, y)`` and ``n_splits``.
        cat_features: Categorical features forwarded to ``lgb.Dataset``.
            ``None`` or an empty sequence means none are passed.
        verbose: Print per-fold score, overall score and elapsed time.
        verbose_eval: Forwarded to ``lgb.train``.
        nseeds: Number of seeds to average per fold; must be >= 1.
        df_input: Whether ``x_train`` is a pandas DataFrame.
        early_stopping: Forwarded to ``lgb.train`` as ``early_stopping_rounds``.
        num_boost_round: Maximum number of boosting rounds.

    Returns:
        Tuple ``(cv_score, train_pred, test_pred)``. ``cv_score`` is
        ``rmsle(y_train, train_pred)``; both prediction arrays are on the
        original target scale.

    Raises:
        ValueError: If ``nseeds`` is smaller than 1.
    """
    if nseeds < 1:
        raise ValueError("nseeds must be at least 1")

    start_time = time.time()
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))

    # Work on a copy so the seed keys set below do not leak into the caller's dict.
    params = dict(params)

    if cat_features is None:
        cat_features = []
    use_cat = len(cat_features) > 0

    # Custom eval metric with the signature LightGBM expects:
    # f(preds, eval_data) -> (name, value, is_higher_better).
    # The model is fit on log1p(y), so both sides are mapped back before scoring.
    def feval_rmsle(preds, train_data):
        preds = np.expm1(preds)
        true = np.expm1(train_data.get_label())
        return 'rmsle', rmsle(true, preds), False

    def take_target(index):
        # kf.split yields positional indices, so pandas objects need .iloc
        # (plain [] / .loc would select by label instead).
        if hasattr(y_train, 'iloc'):
            return y_train.iloc[index]
        return y_train[index]

    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        if df_input:
            x_train_kf = x_train.iloc[train_index, :]
            x_val_kf = x_train.iloc[val_index, :]
        else:
            x_train_kf, x_val_kf = x_train[train_index], x_train[val_index]

        y_train_kf = np.log1p(take_target(train_index))
        y_val_kf = np.log1p(take_target(val_index))

        # Sum of back-transformed validation predictions across seeds.
        val_pred_sum = np.zeros(len(val_index))

        for seed in range(nseeds):
            params['feature_fraction_seed'] = seed
            params['bagging_seed'] = seed

            if use_cat:
                lgb_train = lgb.Dataset(x_train_kf, y_train_kf,
                                        categorical_feature=cat_features)
                lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train,
                                      categorical_feature=cat_features)
            else:
                lgb_train = lgb.Dataset(x_train_kf, y_train_kf)
                lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train)

            # NOTE(review): early_stopping_rounds / verbose_eval are keyword
            # arguments of older lgb.train releases; newer LightGBM versions
            # expect callbacks instead - confirm against the installed version.
            gbm = lgb.train(params,
                            lgb_train,
                            num_boost_round=num_boost_round,
                            valid_sets=[lgb_val],
                            early_stopping_rounds=early_stopping,
                            feval=feval_rmsle,
                            verbose_eval=verbose_eval)

            val_pred_sum += np.expm1(
                gbm.predict(x_val_kf, num_iteration=gbm.best_iteration))
            test_pred += np.expm1(
                gbm.predict(x_test, num_iteration=gbm.best_iteration))

        # Average over seeds (not just the last seed's prediction).
        train_pred[val_index] = val_pred_sum / nseeds

        fold_rmsle = rmsle(np.expm1(np.asarray(y_val_kf)), train_pred[val_index])
        if verbose:
            print('fold cv {} RMSLE score is {:.6f}'.format(i, fold_rmsle))

    test_pred = test_pred / (nseeds * kf.n_splits)
    cv_score = rmsle(y_train, train_pred)
    if verbose:
        print('cv RMSLE score is {:.6f}'.format(cv_score))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))

    # train_pred is already on the original scale (expm1 was applied to each
    # prediction above), so it is returned as-is, like test_pred.
    return cv_score, train_pred, test_pred