In [1]:
import time
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

In [2]:
X_train = pd.read_csv('../input/X_train.csv')
y_train = np.genfromtxt('../input/y_train.csv', delimiter=",")
X_val = pd.read_csv('../input/X_val.csv')
y_val = np.genfromtxt('../input/y_val.csv', delimiter=",")

In [21]:
# Use small portion of data for fast development
FAST_DEV = True
if FAST_DEV:
    DEV_PERCENT = 0.1
    print("Train with {}% data.".format(DEV_PERCENT*100))
    m_train = int(len(X_train) * DEV_PERCENT)
    X_train = X_train.iloc[:m_train, :]
    y_train = y_train[:m_train, :]
    # m_val = int(len(X_val) * DEV_PERCENT)
    # X_val = X_val.iloc[:m_val, :]
    # y_val = y_val[:m_val, :]
else:
    DEV_PERCENT = 1
    print("Train with {}% data.".format(DEV_PERCENT*100))

Train with 10.0% data.


In [22]:
print("Defining parameter space...")

params = {
    'boosting_type': ['gbdt'],
    'num_leaves': [2**8 - 1],
    'objective': ['regression_l2'],
    # 'max_depth': 8,
    'min_data_in_leaf': [50],
    'learning_rate': [0.05],
    'feature_fraction': [0.75],
    'bagging_fraction': [0.75],
    'bagging_freq': [1],
    'metric': ['l2'],
    'num_threads': [4],
    'num_boost_round': [1000],
#     'early_stopping_rounds': [50]
}

Defining parameter space...


In [23]:
cate_names = ['store_type', 'store_cluster', 'item_perishable', 'item_class']
cate_vars = [list(X_train.columns).index(i) for i in cate_names]
print("{} categorical features found in the training set. column #: {}".format(len(cate_vars), cate_vars))

4 categorical features found in the training set. column #: [70, 71, 72, 73]


In [24]:
lgb_reg = lgb.LGBMRegressor(
    random_state=42,
    categorical_feature=cate_vars,
    sample_weight=X_train['item_perishable'].values * 0.25 + 1,
    verbose=50,
#     eval_set=[(X_val, y_val)],
#     eval_sample_weight=[X_val['item_perishable'].values * 0.25 + 1],
#     eval_metric='regression_l2'
)

In [25]:
gbm = GridSearchCV(lgb_reg, params, cv=3)
gbm.fit(X_train, y_train[:, 0])
print(gbm.best_params_)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


{'objective': 'regression_l2', 'metric': 'l2', 'learning_rate': 0.05, 'bagging_freq': 1, 'feature_fraction': 0.75, 'min_data_in_leaf': 50, 'boosting_type': 'gbdt', 'num_threads': 4, 'bagging_fraction': 0.75, 'num_leaves': 255, 'num_boost_round': 1000}


In [26]:
gbm.best_score_

0.66488926104418766

In [27]:
gbm.best_estimator_.categorical_feature

[70, 71, 72, 73]

In [28]:
gbm.best_estimator_

LGBMRegressor(bagging_fraction=0.75, bagging_freq=1, boosting_type='gbdt',
       categorical_feature=[70, 71, 72, 73], class_weight=None,
       colsample_bytree=1.0, feature_fraction=0.75, learning_rate=0.05,
       max_depth=-1, metric='l2', min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=50, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_boost_round=1000, num_leaves=255,
       num_threads=4, objective='regression_l2', random_state=42,
       reg_alpha=0.0, reg_lambda=0.0,
       sample_weight=array([ 1.,  1., ...,  1.,  1.]), silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
       verbose=50)

In [29]:
print("\n".join(("%s: %.2f" % x) for x in sorted(
    zip(X_train.columns, gbm.best_estimator_.feature_importances_),
    key=lambda x: x[1], reverse=True)
))

item_class: 3571.00
mean_7: 3566.00
mean_3: 3349.00
mean_4_dow5_2017: 3302.00
mean_4_dow3_2017: 3285.00
mean_4_dow0_2017: 3097.00
mean_20_dow4_2017: 3092.00
mean_4_dow4_2017: 3029.00
mean_4_dow6_2017: 3015.00
mean_20_dow5_2017: 3013.00
mean_20_dow3_2017: 2960.00
mean_20_dow2_2017: 2942.00
store_cluster: 2934.00
mean_20_dow1_2017: 2923.00
mean_14: 2915.00
mean_4_dow2_2017: 2858.00
mean_4_dow1_2017: 2827.00
mean_20_dow6_2017: 2812.00
mean_20_dow0_2017: 2808.00
mean_70: 2664.00
mean_21: 2451.00
mean_35: 2418.00
mean_28: 2104.00
mean_140: 2023.00
days_since_last_salary: 1654.00
promo_140_sum: 1340.00
promo_70_sum: 784.00
holiday_28_sum: 547.00
holiday_35_sum: 547.00
holiday_70_sum: 436.00
holiday_21_sum: 345.00
promo_35_sum: 342.00
promo_28_sum: 221.00
promo_0: 200.00
store_type: 195.00
promo_21_sum: 187.00
holiday_14_sum: 160.00
promo_14_sum: 137.00
holiday_140_sum: 111.00
promo_7: 111.00
promo_7_sum: 100.00
item_perishable: 63.00
promo_9: 54.00
promo_14: 42.00
promo_3_sum: 37.00
promo_2:

In [30]:
gbm.best_estimator_.feature_importances_

array([ 111,  160,  345,  547,  547,    0,  436,    5, 2915, 2023, 2451,
       2104, 3349, 2418, 3566, 2664, 1340,  137,  187,  221,  342,   37,
        784,  100, 3097, 2808, 2827, 2923, 2858, 2942, 3285, 2960, 3029,
       3092, 3302, 3013, 3015, 2812,  200,    0,   13,    0,   36,    0,
         23,    0,   19,    0,    5,    0,   11,    0,  111,    0,   22,
          0,   54,    0,   18,    0,   26,    0,   11,    0,   28,    0,
         42,    0,   30,    0,  195, 2934,   63, 3571, 1654])