In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import timedelta, date
import lightgbm as lgb

In [2]:
# Relative directory holding the sample CSV files. The trailing slash matters:
# file names are appended to this string by plain concatenation.
DATA_PATH = "data/sample/"

In [4]:
# Feature tables are read as DataFrames (pandas infers the header row).
X_train, X_val, X_test = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("X_train_2017.csv", "X_val_2017.csv", "X_test.csv")
)

# Targets are read as plain NumPy arrays from comma-separated files; no target
# file is loaded for the test set.
y_train, y_val = (
    np.genfromtxt(DATA_PATH + file_name, delimiter=",")
    for file_name in ("y_train_2017.csv", "y_val_2017.csv")
)

In [7]:
# LightGBM settings handed to lgb.train for every booster trained below.
params = dict(
    num_leaves=(1 << 5) - 1,  # 31 leaves per tree
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

# Upper bound on boosting rounds; also the fallback iteration count at predict time.
MAX_ROUNDS = 1000

# Per-step prediction arrays, filled by the training loop.
val_pred, test_pred = [], []

# Columns to treat as categorical features (none are declared).
cate_vars = []

In [8]:
# Start from empty accumulators so that re-running this cell does not stack a
# second set of predictions on top of the ones left by a previous run.
val_pred = []
test_pred = []

# The sample weights do not depend on the step index, so build them once
# instead of recomputing them on every iteration.
# NOTE(review): presumably `item_perishable` is a 0/1 flag, which would give
# weights of 1.0 and 1.25 -- confirm against the feature-building code.
train_weight = X_train['item_perishable'] * 0.25 + 1
val_weight = X_val['item_perishable'] * 0.25 + 1

# Train one independent booster per target column (columns 0..15 of
# y_train / y_val) and collect its validation and test predictions.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    # `reference=dtrain` ties the validation set to the training Dataset.
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)
    # Both sets are evaluated and logged every 50 rounds; training stops early
    # after 50 rounds without validation improvement.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )
    # Feature importances by total gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration; fall back to MAX_ROUNDS when
    # `best_iteration` is falsy.
    n_iter = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=n_iter))
    test_pred.append(bst.predict(X_test, num_iteration=n_iter))


Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.359732	valid_1's l2: 0.371342
[100]	training's l2: 0.345239	valid_1's l2: 0.356706
[150]	training's l2: 0.341331	valid_1's l2: 0.35434
[200]	training's l2: 0.339159	valid_1's l2: 0.353216
[250]	training's l2: 0.337521	valid_1's l2: 0.352411
[300]	training's l2: 0.336128	valid_1's l2: 0.351869
[350]	training's l2: 0.334793	valid_1's l2: 0.351251
[400]	training's l2: 0.333674	valid_1's l2: 0.350817
[450]	training's l2: 0.332653	valid_1's l2: 0.350626
[500]	training's l2: 0.331732	valid_1's l2: 0.350471
[550]	training's l2: 0.330833	valid_1's l2: 0.350243
[600]	training's l2: 0.330033	valid_1's l2: 0.350184
[650]	training's l2: 0.329269	valid_1's l2: 0.35003
[700]	training's l2: 0.328527	valid_1's l2: 0.349916
[750]	training's l2: 0.327821	valid_1's l2: 0.349767
[800]	training's l2: 0.327136	valid_1's l2: 0.349683
[850]	training's l2: 0.326481	valid_1's l2: 0.349579
[900]	training's l2: 0.325873	valid_1's 