In [1]:
import lightgbm as lgb
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
from derive_functions.derive_date_var import derive_calender_feats
from derive_functions.data_prepare_func import create_dt,reduce_mem_usage
from derive_functions.derive_lag_mean_feats import create_lag_feats,create_lag_mean_feats
from derive_functions.derive_mean_feats_cat import get_df_cat
from derive_functions.derive_mean_feats_dept import get_df_dept
from derive_functions.derive_mean_feats_id import get_df_id
from derive_functions.derive_mean_feats_item import get_df_item
from derive_functions.derive_mean_feats_state import get_df_state
from derive_functions.derive_mean_feats_store import get_df_store
from derive_functions.derive_deviation_feats import get_deviation_feats
from sklearn.model_selection import GroupKFold
from sklearn import metrics

# 1. 读取数据

In [2]:
# Load the pickled object that the cells below use as the training DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by a trusted pipeline.
data = pd.read_pickle('data/processed_data/df_train_var_all_0626_WI_subset.pkl')

# 2. 训练测试数据切分及预处理

In [3]:
# Build the training frame with a fresh 0..n-1 index.
# reset_index(drop=True) already returns a new DataFrame, so no explicit
# copy(deep=True) is made first; that would only add one more full-size
# copy at peak memory right before `data` is released.
data_train = data.reset_index(drop=True)
# data_test = data[data.date>'2015-12-20'].copy(deep=True).reset_index(drop=True)
# Drop the original reference and reclaim its memory immediately.
del data
gc.collect()

0

In [4]:
# Collect the names of all columns whose name contains 'per_store'.
store_feats = [col for col in data_train.columns if 'per_store' in col]

In [4]:
# Columns that are excluded from the feature matrix.
useless_feats = [
    'd', 'date', 'wm_yr_wk', 'sales',
    'weekday', 'revenue', 'id', 'state_id',
]
# Columns later passed to LightGBM as `categorical_feature`.
cat_feats = [
    'item_id', 'dept_id', 'cat_id', 'store_id',
    "wday", "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Keep every column not listed in `useless_feats`, in the frame's own column order.
train_cols = data_train.columns[np.logical_not(data_train.columns.isin(useless_feats))]
# Feature matrix and target ("sales") share data_train's index.
X_train, y_train = data_train[train_cols], data_train["sales"]


In [7]:
# Print the current local date and time as a simple progress timestamp.
print(datetime.now())

2020-06-29 20:25:25.851888


# 3. 训练验证数据切分及训练

## 3.1 随机切分

In [7]:
# Seed NumPy's global RNG so the sampled validation rows are the same on every run.
np.random.seed(666)

# Sample 1,000,000 distinct index labels (no replacement) as a pseudo validation set;
# every remaining label goes to the training set.
fake_valid_inds = np.random.choice(X_train.index.values, size=1000000, replace=False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

# Wrap both partitions as LightGBM Datasets with the same categorical columns.
train_data = lgb.Dataset(
    data=X_train.loc[train_inds],
    label=y_train.loc[train_inds],
    categorical_feature=cat_feats,
    free_raw_data=False,
)
fake_valid_data = lgb.Dataset(
    data=X_train.loc[fake_valid_inds],
    label=y_train.loc[fake_valid_inds],
    categorical_feature=cat_feats,
    free_raw_data=False,
)

In [13]:
%%time
# 新增加mode数据两列
params = {
        "objective" : "tweedie",
        'tweedie_variance_power': 1.1,
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.025,
        "subsample":0.5,
        "sub_feature" : 0.6,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 2**11-1,
    "min_data_in_leaf": 2**12-1,
}

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=50)

[50]	valid_0's rmse: 2.40657
[100]	valid_0's rmse: 2.20151
[150]	valid_0's rmse: 2.15869
[200]	valid_0's rmse: 2.13479
[250]	valid_0's rmse: 2.11779
[300]	valid_0's rmse: 2.10595
[350]	valid_0's rmse: 2.09719
[400]	valid_0's rmse: 2.08956
[450]	valid_0's rmse: 2.08277
[500]	valid_0's rmse: 2.07671
[550]	valid_0's rmse: 2.07117
[600]	valid_0's rmse: 2.06597
[650]	valid_0's rmse: 2.06133
[700]	valid_0's rmse: 2.05742
[750]	valid_0's rmse: 2.05314
[800]	valid_0's rmse: 2.04973
[850]	valid_0's rmse: 2.04689
[900]	valid_0's rmse: 2.04401
[950]	valid_0's rmse: 2.04085
[1000]	valid_0's rmse: 2.03823
[1050]	valid_0's rmse: 2.03526
[1100]	valid_0's rmse: 2.03293
[1150]	valid_0's rmse: 2.03049
[1200]	valid_0's rmse: 2.02843
Wall time: 22min 8s


In [14]:
# Persist the booster trained in the previous cell.
# NOTE(review): Booster.save_model writes LightGBM's own model-file format, not a
# pickle, so the ".pkl" suffix looks misleading — confirm how downstream code loads it.
m_lgb.save_model('model/random_split/model_tweedie_WI_var_all.pkl')

<lightgbm.basic.Booster at 0x24b1166fd08>

## 3.2 最后一个半月作为验证集

In [5]:
# Validation rows are those dated after 2016-04-07. The boolean mask is applied
# directly to the index array, so no filtered copy of the whole frame (all
# columns) is materialised just to read its index.
fake_valid_inds = data_train.index.values[(data_train.date > '2016-04-07').values]
# data_train is not needed past this point; release it before building the Datasets.
del data_train
gc.collect()

# Every index label that is not in the validation set is used for training.
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds], label=y_train.loc[train_inds],
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label=y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats, free_raw_data=False)

In [7]:
%%time
## 新加2个feats
params = {
        "objective" : "tweedie",
        'tweedie_variance_power': 1.1,
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.01,
        "subsample":0.5,
        "sub_feature" : 0.6,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "metric": ["rmse"],
    'verbosity': 1,
    'max_bin':100,
    'num_iterations' : 950,
    'num_leaves': 2**8-1,
    "min_data_in_leaf": 2**12-1,
}

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=50)
print(datetime.now())
m_lgb.save_model('model/time_split/model_tweedie_WI_var_all_v8.pkl')

[50]	valid_0's rmse: 3.30481
[100]	valid_0's rmse: 2.81684
[150]	valid_0's rmse: 2.54717
[200]	valid_0's rmse: 2.4109
[250]	valid_0's rmse: 2.34162
[300]	valid_0's rmse: 2.30299
[350]	valid_0's rmse: 2.27949
[400]	valid_0's rmse: 2.26391
[450]	valid_0's rmse: 2.25132
[500]	valid_0's rmse: 2.24206
[550]	valid_0's rmse: 2.23417
[600]	valid_0's rmse: 2.22913
[650]	valid_0's rmse: 2.22471
[700]	valid_0's rmse: 2.22126
[750]	valid_0's rmse: 2.21853
[800]	valid_0's rmse: 2.21631
[850]	valid_0's rmse: 2.21452
[900]	valid_0's rmse: 2.21265
[950]	valid_0's rmse: 2.21135
2020-06-30 21:59:01.231299
Wall time: 22min 1s


<lightgbm.basic.Booster at 0x1ac2dd4af48>