# LightGBM

# Feature Engineering

In [3]:
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error,r2_score
import time

import lightgbm as lgb
from lightgbm import LGBMRegressor
from bayes_opt import BayesianOptimization

import warnings
warnings.filterwarnings("ignore")

In [4]:
"""
Contributions from:
DSEverything - Mean Mix - Math, Geo, Harmonic (LB 0.493) 
https://www.kaggle.com/dongxu027/mean-mix-math-geo-harmonic-lb-0-493
JdPaletto - Surprised Yet? - Part2 - (LB: 0.503)
https://www.kaggle.com/jdpaletto/surprised-yet-part2-lb-0-503
hklee - weighted mean comparisons, LB 0.497, 1ST
https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st

Also all comments for changes, encouragement, and forked scripts rock

Keep the Surprise Going
"""

import glob, re
import numpy as np
import pandas as pd
from sklearn import *
from datetime import datetime

data = {
    'tra': pd.read_csv('../../../mltestdata/05_recruit/air_visit_data.csv'),
    'as': pd.read_csv('../../../mltestdata/05_recruit/air_store_info.csv'),
    'hs': pd.read_csv('../../../mltestdata/05_recruit/hpg_store_info.csv'),
    'ar': pd.read_csv('../../../mltestdata/05_recruit/air_reserve.csv'),
    'hr': pd.read_csv('../../../mltestdata/05_recruit/hpg_reserve.csv'),
    'id': pd.read_csv('../../../mltestdata/05_recruit/store_id_relation.csv'),
    'tes': pd.read_csv('../../../mltestdata/05_recruit/sample_submission.csv'),
    'hol': pd.read_csv('../../../mltestdata/05_recruit/date_info.csv').rename(columns={'calendar_date':'visit_date'})
    }

data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

for df in ['ar','hr']:
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime'])
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    tmp1 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs1', 'reserve_visitors':'rv1'})
    tmp2 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs2', 'reserve_visitors':'rv2'})
    data[df] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])

data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
data['tra']['dow'] = data['tra']['visit_date'].dt.dayofweek
data['tra']['year'] = data['tra']['visit_date'].dt.year
data['tra']['month'] = data['tra']['visit_date'].dt.month
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
data['tes']['year'] = data['tes']['visit_date'].dt.year
data['tes']['month'] = data['tes']['visit_date'].dt.month
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

unique_stores = data['tes']['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)

#sure it can be compressed...
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 

stores = pd.merge(stores, data['as'], how='left', on=['air_store_id']) 
# NEW FEATURES FROM Georgii Vyshnia
stores['air_genre_name'] = stores['air_genre_name'].map(lambda x: str(str(x).replace('/',' ')))
stores['air_area_name'] = stores['air_area_name'].map(lambda x: str(str(x).replace('-',' ')))
lbl = preprocessing.LabelEncoder()
for i in range(10):
    stores['air_genre_name'+str(i)] = lbl.fit_transform(stores['air_genre_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
    stores['air_area_name'+str(i)] = lbl.fit_transform(stores['air_area_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date
train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date']) 
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date']) 

train = pd.merge(train, stores, how='left', on=['air_store_id','dow']) 
test = pd.merge(test, stores, how='left', on=['air_store_id','dow'])

for df in ['ar','hr']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','visit_date'])

train['id'] = train.apply(lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), axis=1)

train['total_reserv_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserv_mean'] = (train['rv2_x'] + train['rv2_y']) / 2
train['total_reserv_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y']) / 2

test['total_reserv_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserv_mean'] = (test['rv2_x'] + test['rv2_y']) / 2
test['total_reserv_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y']) / 2

# NEW FEATURES FROM JMBULL
train['date_int'] = train['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
test['date_int'] = test['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']

# NEW FEATURES FROM Georgii Vyshnia
train['lon_plus_lat'] = train['longitude'] + train['latitude'] 
test['lon_plus_lat'] = test['longitude'] + test['latitude']

lbl = preprocessing.LabelEncoder()
train['air_store_id2'] = lbl.fit_transform(train['air_store_id'])
test['air_store_id2'] = lbl.transform(test['air_store_id'])

col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors']]
train = train.fillna(-1)
test = test.fillna(-1)

In [5]:
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(train.columns, train.dtypes):	
    if dtype == np.float64:		
        train[c] = train[c].astype(np.float32)

for c, dtype in zip(test.columns, test.dtypes):	
    if dtype == np.float64:		
        test[c] = test[c].astype(np.float32)


Processing data for LightGBM ...


In [6]:
drop_cols=['visitors','air_store_id','visit_date','id']
y_train=train['visitors']
x_train=train.drop(drop_cols, axis=1)

x_test=test.copy()
x_test=x_test.drop(drop_cols, axis=1)

# Modeling

In [7]:
y = train.visitors
train_input = train.copy()
test_input = test.copy()

drop_cols=['visitors','air_store_id','visit_date','id']
train_input=train_input.drop(drop_cols, axis=1)
test_input=test_input.drop(drop_cols, axis=1)

In [8]:
localtrain, localval = train_test_split(train,test_size=0.3,random_state=2018)

y_localtrain=localtrain['visitors']
x_localtrain=localtrain.drop(drop_cols, axis=1)

y_localval=localval['visitors']
x_localval=localval.drop(drop_cols, axis=1)

## LightGBM

In [101]:
#Define a evaluation function

def rmsle(preds, true):
    rmsle = np.sqrt(mean_squared_error(np.log1p(true), np.log1p(preds)))
    return float(rmsle)

In [102]:
#Define a evaluation matrix 
from sklearn.metrics.scorer import make_scorer
RMSLE = make_scorer(rmsle)

In [103]:
# Define a function for comparing predictions and true data.
def compare_result(preds, true):
    compare = pd.DataFrame({"test_id": true.index,
                           "real_cost": true,
                           "pred_cost": preds})
    compare = compare[["test_id", "real_cost", "pred_cost"]].reset_index(drop=True)
    
    compare["error_percent_(%)"] = np.abs(compare.real_cost - compare.pred_cost) / compare.real_cost * 100
    
    return compare

# Cross validation with LightGBM

In [1]:
def cross_validate_lgb_yifan(params, x_train, y_train, #x_test, 
                             kf, cat_features=[],
                             verbose=True, verbose_eval=100, nseeds=1, df_input=True,
                             early_stopping=100, num_boost_round=8000):
    start_time = time.time()
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))

    # self-defined eval metric
    # f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
    # binary error
    def feval_rmsle(preds, train_data):
        preds = np.expm1(preds)
        true = np.expm1(train_data.get_label())
        #return 'rmsle', rmsle(true, preds), False

        return 'rmsle', rmsle(preds, true), False

        
    if len(cat_features)==0: use_cat=False

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)): # folds 1, 2 ,3 ,4, 5
        # example: training from 1,2,3,4; validation from 5
        if df_input:
            x_train_kf, x_val_kf = x_train.loc[train_index, :], x_train.loc[val_index, :]
        else:
            x_train_kf, x_val_kf = x_train[train_index], x_train[val_index]

        y_train_kf, y_val_kf = np.log1p(y_train[train_index]), np.log1p(y_train[val_index])

        #print("y_train_kf     "+str(y_train_kf.values[0:5]))
        #print("y_val_kf     "+str(y_val_kf.values[0:5]))

        for seed in range(nseeds):
            params['feature_fraction_seed'] = seed
            params['bagging_seed'] = seed

            if use_cat:
                lgb_train = lgb.Dataset(x_train_kf, y_train_kf, categorical_feature=cat_features)
                lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train, categorical_feature=cat_features)

            else:
                lgb_train = lgb.Dataset(x_train_kf, y_train_kf)
                lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train)

            gbm = lgb.train(params,
                            lgb_train,
                            num_boost_round=num_boost_round,
                            valid_sets=[lgb_val],
                            early_stopping_rounds=early_stopping,
                            feval=feval_rmsle,
                            verbose_eval=verbose_eval)

            #print("gbm.best_iteration     "+str(gbm.best_iteration))
            val_pred = np.expm1(gbm.predict(x_val_kf, num_iteration=gbm.best_iteration))
      
            #print("gbm.predict(x_val_kf, num_iteration=gbm.best_iteration)     "+str(gbm.predict(x_val_kf, num_iteration=gbm.best_iteration)[0:5]))
            #print("gbm.predict(x_val_kf)     "+str(gbm.predict(x_val_kf)[0:5]))
            #print("gbm.predict(x_val_kf) exp"+str(np.expm1(gbm.predict(x_val_kf)[0:5])))
            
            train_pred[val_index] += val_pred
            test_pred += np.expm1((gbm.predict(x_test, num_iteration=gbm.best_iteration)))


        train_pred[val_index] = val_pred/nseeds

        #fold_rmsle = rmsle(np.expm1(y_val_kf.values), train_pred[val_index])
        fold_rmsle = rmsle(train_pred[val_index],np.expm1(y_val_kf.values))
        if verbose:
            print('fold cv {} RMSLE score is {:.6f}\n'.format(i, fold_rmsle))


    test_pred = test_pred / (nseeds * kf.n_splits)
    #cv_score = rmsle(y_train, train_pred)
    cv_score = rmsle(train_pred, y_train)
    
    if verbose:
        print('cv RMSLE score is {:.6f}'.format(cv_score))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))
    #return cv_score, np.expm1(train_pred),test_pred
    return cv_score

In [128]:
def cross_validate_lgb_nofeval(params, x_train, y_train, kf, verbose=True, verbose_eval=50,df_input=True):

    start_time = time.time()
    train_pred = np.zeros((x_train.shape[0]))

    # self-defined eval metric
    # f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
    # binary error
    def feval_rmsle(preds, train_data):
        #preds = np.expm1(preds)
        #true = np.expm1(train_data.get_label())
        true = train_data.get_label()
        return 'my rmsle', rmsle(preds,true), False

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)): # folds 1, 2 ,3 ,4, 5
        # example: training from 1,2,3,4; validation from 5

        if df_input:
            x_train_kf, x_val_kf = x_train.loc[train_index, :], x_train.loc[val_index, :]
        else:
            x_train_kf, x_val_kf = x_train[train_index], x_train[val_index]
            
        #y_train_kf, y_val_kf = y_train[train_index], y_train[val_index]
        y_train_kf_log, y_val_kf_log = np.log1p(y_train[train_index]), np.log1p(y_train[val_index])

        lgb_train = lgb.Dataset(x_train_kf, y_train_kf_log)
        lgb_val = lgb.Dataset(x_val_kf, y_val_kf_log, reference=lgb_train)
        
        watchlist= [lgb_train, lgb_val]

        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=8000,
                        #valid_sets=lgb_val,
                        valid_sets=watchlist, 
                        early_stopping_rounds=100,
                        verbose_eval=verbose_eval,
                        #feval=feval_rmsle
                       )

        val_pred_log = gbm.predict(x_val_kf, num_iteration=gbm.best_iteration)

        train_pred[val_index] += val_pred_log
        #fold_rmsle = rmsle(val_pred, y_val_kf.values)
        print("val_pred_log  "+str(val_pred_log[0:5]))
        print("val_pred_log  exp"+str(np.expm1(val_pred_log[0:5])))
        print("y_val_kf_log  "+str(y_val_kf_log.values[0:5]))
        print("y_val_kf_log  exp"+str(np.expm1(y_val_kf_log.values[0:5])))
        
        fold_rmsle = rmsle(np.expm1(val_pred_log), np.expm1(y_val_kf_log.values))

        if verbose:
            print('fold cv {} RMSLE score is {:.6f}\n'.format(i, fold_rmsle))
 
    cv_rmsle = rmsle(np.expm1(train_pred), y_train)

    if verbose:
        print('cv RMSLE score is {:.6f}'.format(cv_rmsle))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))

    return cv_rmsle
#    return cv_rmsle, train_pred


In [78]:
def cross_validate_lgb(params, x_train, y_train, kf, verbose=True, verbose_eval=50,df_input=True):

    start_time = time.time()
    train_pred = np.zeros((x_train.shape[0]))

    # self-defined eval metric
    # f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
    # binary error
    def feval_rmsle(preds, train_data):
        #preds = np.expm1(preds)
        #true = np.expm1(train_data.get_label())
        true = train_data.get_label()
        return 'my rmsle', rmsle(preds,true), False

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)): # folds 1, 2 ,3 ,4, 5
        # example: training from 1,2,3,4; validation from 5

        if df_input:
            x_train_kf, x_val_kf = x_train.loc[train_index, :], x_train.loc[val_index, :]
        else:
            x_train_kf, x_val_kf = x_train[train_index], x_train[val_index]
            
        y_train_kf, y_val_kf = y_train[train_index], y_train[val_index]
        #y_train_kf, y_val_kf = np.log1p(y_train[train_index]), np.log1p(y_train[val_index])

        lgb_train = lgb.Dataset(x_train_kf, y_train_kf)
        lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train)
        
        watchlist= [lgb_train, lgb_val]

        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=8000,
                        #valid_sets=lgb_val,
                        valid_sets=watchlist, 
                        early_stopping_rounds=100,
                        verbose_eval=verbose_eval,
                        feval=feval_rmsle
                       )

        val_pred = gbm.predict(x_val_kf, num_iteration=gbm.best_iteration)

        train_pred[val_index] += val_pred
        fold_rmsle = rmsle(val_pred, y_val_kf.values)

        if verbose:
            print('fold cv {} RMSLE score is {:.6f}\n'.format(i, fold_rmsle))
 
    cv_rmsle = rmsle(train_pred, y_train)

    if verbose:
        print('cv RMSLE score is {:.6f}'.format(cv_rmsle))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))

    return cv_rmsle
#    return cv_rmsle, train_pred


In [136]:
lgb_params = {
    'boosting_type': 'dart',
    'max_depth' : 5,
    'max_bin' : 500,
    'learning_rate': 0.1,  # 0.618580
    'num_leaves': 22,
    #'metric': 'RMSE'
}

# only do 5 fold CV here so that we save some running time on Kaggle Kernel
kf=StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)
#kf=KFold(n_splits=3, shuffle=True, random_state=2018)

print('Start training...')

#cv_score =cross_validate_lgb(lgb_params, train_input, y, kf, verbose=True, verbose_eval=50,df_input=True)
#cv_score =cross_validate_lgb_nofeval(lgb_params, train_input, y, kf, verbose=True, verbose_eval=50,df_input=True)
cv_score =cross_validate_lgb_yifan(lgb_params, train_input, y, kf, verbose=True, verbose_eval=50,df_input=True)

print('cv score is {:.6f}'.format(cv_score))

Start training...
y_train_kf     [ 3.25809654  3.49650756  3.40119738  3.13549422  3.4657359 ]
y_val_kf     [ 1.94591015  2.30258509  3.09104245  2.94443898  2.48490665]
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.57728
[100]	valid_0's rmsle: 0.583222
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.511092
gbm.best_iteration     45
gbm.predict(x_val_kf, num_iteration=gbm.best_iteration)     [ 0.98279632  1.16748234  1.18477145  1.43830156  1.16748234]
gbm.predict(x_val_kf)     [ 0.98279632  1.16748234  1.18477145  1.43830156  1.16748234]
gbm.predict(x_val_kf) exp[ 1.67191734  2.21389095  2.26993939  3.21353329  2.21389095]
fold cv 0 RMSLE score is 1.779739

y_train_kf     [ 3.13549422  1.94591015  2.30258509  3.09104245  2.94443898]
y_val_kf     [ 3.25809654  3.49650756  3.40119738  3.4657359   3.29583687]
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.582775
[100]	valid_0's rmsle: 0.588823
Ear

# Bayesian Optimsation - Setup

In [137]:
params={
    'num_leaves':(7,100),#(7,4095),
    'max_depth':(2,63),
    'learning_rate':(0.05,0.3),
    #'scale_pos_weight':(1,10000),
    'min_sum_hessian_in_leaf':(2,30),
    'subsample':(0.4,1.0),
    'colsample_bytree':(0.4,1.0),
    #'feature_fraction':(0.0,1.0),
    #'bagging_fraction':(0.0,1.0),
    #'bagging_freq':(0,2),
    #'lambda_l1':(0.0,1.0),
    #'lambda_l2':(0.0,1.0),
    #'n_estimators':(2,30), 
    #'reg_lambda':(0.0,2.0),
    #'min_gain_to_split':(0.0,1.0)
}

In [138]:
# reload(lgb_wrapper)
#def lgbcv_func(max_depth, learning_rate, subsample, colsample_bytree, nthread=4, seed=0):
def lgbcv_func(num_leaves, max_depth, learning_rate,
               #scale_pos_weight, 
               min_sum_hessian_in_leaf, 
               subsample, 
               colsample_bytree,
               #feature_fraction, bagging_fraction, 
               #bagging_freq, lambda_l1, lambda_l2,
               #n_estimators,reg_lambda,min_gain_to_split,
               nthread=4):

    params = {
        'objective' : "regression",
        'task': 'train',
        'boosting_type': 'dart',
                
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth), 
        'learning_rate': float(learning_rate),
        #'scale_pos_weight':scale_pos_weight,
        'min_sum_hessian_in_leaf':float(min_sum_hessian_in_leaf), 
        'subsample':subsample,
        'colsample_bytree':colsample_bytree,
        #'feature_fraction':feature_fraction, 
        #'bagging_fraction':bagging_fraction,
        #'bagging_freq':bagging_freq, 
        #'lambda_l1':lambda_l1, 
        #'lambda_l2':lambda_l2,
        #'n_estimators':n_estimators,
        #'reg_lambda':reg_lambda,
        #'min_gain_to_split':min_gain_to_split       
        #'metric': 'RMSE'
    }
    
    # for a more ideal out-of-fold model prediction for this dataset, we use 10-fold CV
    kf=StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)
    
    # we will disable all the verbose setting in this functional call, so that we don't have too much information 
    # to read during the bayesian optimisation process.
    return 1-cross_validate_lgb(params, train_input, y, kf, verbose=False, verbose_eval=False)

In [139]:
#train_input.isnull().any()

In [140]:
lgb_bo=BayesianOptimization(lgbcv_func, params)

In [141]:
lgb_bo.maximize(init_points=5, n_iter=20)

[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_sum_hessian_in_leaf |   num_leaves |   subsample | 
    1 | 01m16s | [35m   0.41558[0m | [32m            0.8497[0m | [32m         0.1160[0m | [32m    14.4971[0m | [32m                  24.3924[0m | [32m     77.2112[0m | [32m     0.8666[0m | 
    2 | 01m13s | [35m   0.43342[0m | [32m            0.9888[0m | [32m         0.1612[0m | [32m    30.1503[0m | [32m                   7.2217[0m | [32m     79.5667[0m | [32m     0.8434[0m | 


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [1]:
lgb_bo.res['max']['max_params']

NameError: name 'lgb_bo' is not defined

In [None]:
'''

{'bagging_fraction': 0.17995806067430897,
 'bagging_freq': 1.3430342823014347,
 'feature_fraction': 0.51232300441712064,
 'lambda_l1': 0.54056564353676506,
 'max_depth': 20.348482273206898,
 'min_sum_hessian_in_leaf': 29.260145258218444,
 'num_leaves': 15.619747584846419}
 '''

In [21]:
print('-'*30)
print('Final Results')
print('Maximum value: %f' % lgb_bo.res['max']['max_val'])
print('Best parameters: ', lgb_bo.res['max']['max_params'])

------------------------------
Final Results
Maximum value: 0.422800
Best parameters:  {'num_leaves': 15.619747584846419, 'min_sum_hessian_in_leaf': 29.260145258218444, 'max_depth': 20.348482273206898, 'feature_fraction': 0.51232300441712064, 'bagging_fraction': 0.17995806067430897, 'bagging_freq': 1.3430342823014347, 'lambda_l1': 0.54056564353676506}


In [None]:
'''
------------------------------
Final Results
Maximum value: 0.422800
Best parameters:  {'num_leaves': 15.619747584846419, 'min_sum_hessian_in_leaf': 29.260145258218444, 'max_depth': 20.348482273206898, 'feature_fraction': 0.51232300441712064, 'bagging_fraction': 0.17995806067430897, 'bagging_freq': 1.3430342823014347, 'lambda_l1': 0.54056564353676506}
'''

# Submission

In [34]:
lgb_params = {
    'objective' : "regression",
    'learning_rate': 0.1,       
    'task': 'train',
    'boosting_type': 'dart',
    'num_leaves': 16,#15.619747584846419, 
    'min_sum_hessian_in_leaf': 29.260145258218444, 
    'max_depth': 20,#20.348482273206898,
    'feature_fraction': 0.51232300441712064, 
    'bagging_fraction': 0.17995806067430897, 
    'bagging_freq': 1, #1.3430342823014347, 
    'lambda_l1': 0.54056564353676506
}

In [36]:
# only do 3 fold CV here so that we save some running time on Kaggle Kernel
kf=StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)

print('Start training...')

lgb_cv_score, lgb_train_pred =cross_validate_lgb(lgb_params, train_input, y, kf, verbose=True, verbose_eval=50,df_input=True)

print('cv score is {:.6f}'.format(cv_score))

Start training...
Training until validation scores don't improve for 30 rounds.
[50]	training's rmsle: 0.580172	valid_1's rmsle: 0.579043
Early stopping, best iteration is:
[45]	training's rmsle: 0.511956	valid_1's rmsle: 0.512371
fold cv 0 RMSLE score is 0.577125

Training until validation scores don't improve for 30 rounds.
[50]	training's rmsle: 0.578437	valid_1's rmsle: 0.584288
Early stopping, best iteration is:
[45]	training's rmsle: 0.510283	valid_1's rmsle: 0.516068
fold cv 1 RMSLE score is 0.575701

Training until validation scores don't improve for 30 rounds.
[50]	training's rmsle: 0.580905	valid_1's rmsle: 0.578431
Early stopping, best iteration is:
[45]	training's rmsle: 0.513352	valid_1's rmsle: 0.509707
fold cv 2 RMSLE score is 0.573614

cv RMSLE score is 0.575483
it takes 13.225 seconds to perform cross validation
cv score is 0.575483


In [37]:
lgb_train_pred

array([ 6.69600288,  5.47287828,  8.57551433, ...,  1.95046062,
        2.10087429,  2.00777837])

In [None]:
gbm = lgb.train(lgb_params,
                ,
                num_boost_round=4000,
                valid_sets=lgb_val,
                early_stopping_rounds=30,
                verbose_eval=verbose_eval)


lgb_model = lgb.train(lgb_params, train_set=d_train, num_boost_round=rounds, 
                          valid_sets=watchlist, verbose_eval=1000, early_stopping_rounds = 300)
test_pred = lgb_model.predict(X_v)

params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
    early_stopping_rounds=500, verbose_eval=500

    
xgr_g = xgb.XGBRegressor(**grid_xgb.best_params_)
xgr_g.fit(X_train, y_train)
y_pred_gs = xgr_g.predict(X_test)