In [1]:
import lightgbm as lgb
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
from derive_functions.derive_date_var import derive_calender_feats
from derive_functions.data_prepare_func import create_dt,reduce_mem_usage
from derive_functions.derive_lag_mean_feats import create_lag_feats,create_lag_mean_feats
from derive_functions.derive_mean_feats_cat import get_df_cat
from derive_functions.derive_mean_feats_dept import get_df_dept
from derive_functions.derive_mean_feats_id import get_df_id
from derive_functions.derive_mean_feats_item import get_df_item
from derive_functions.derive_mean_feats_state import get_df_state
from derive_functions.derive_mean_feats_store import get_df_store
from derive_functions.derive_deviation_feats import get_deviation_feats
from sklearn.model_selection import GroupKFold
from sklearn import metrics

# 1. 读取数据

In [2]:
# Load the pickled training frame from the processed-data directory.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this project.
data = pd.read_pickle('data/processed_data/df_train_var_all_0626_CA_subset.pkl')

# 2. 训练测试数据切分及预处理

In [3]:
# Build the working training frame with a fresh 0..n-1 row index.
# reset_index(drop=True) already returns a new DataFrame, and `data` is deleted right
# below, so an additional deep copy would only raise peak memory.
data_train = data.reset_index(drop=True)
# Alternative date-based train/test split, kept for reference (disabled):
# data_train = data[data.date<='2016-04-24'].copy(deep=True).reset_index(drop=True)
# data_test = data[data.date>'2015-12-20'].copy(deep=True).reset_index(drop=True)
del data  # release the original frame; only data_train is used from here on
gc.collect()

0

In [4]:
# Earliest date present in the training frame.
data_train["date"].min()

Timestamp('2014-09-06 00:00:00')

In [5]:
# Latest date present in the training frame.
data_train["date"].max()

Timestamp('2016-05-22 00:00:00')

In [4]:
# Collect every column whose name contains the substring 'per_store'.
store_feats = [col for col in data_train.columns if 'per_store' in col]

In [4]:
# Columns excluded from the model inputs (this list includes the target column `sales`).
useless_feats = ['d', 'date', 'wm_yr_wk', 'sales', 'weekday', 'revenue', 'id', 'state_id']

# Columns passed to LightGBM as categorical features when the Datasets are built.
cat_feats = [
    'item_id', 'dept_id', 'cat_id', 'store_id',
    "wday", "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]

# Keep every column that is not listed in useless_feats, preserving column order.
is_model_input = ~data_train.columns.isin(useless_feats)
train_cols = data_train.columns[is_model_input]

# Feature matrix and target vector.
X_train = data_train.loc[:, train_cols]
y_train = data_train.loc[:, "sales"]


# 3. 训练验证数据切分及预处理

## 3.1 随机切分

In [5]:
# Random hold-out split: draw 1,000,000 row labels (seeded, without replacement) as a
# "fake" validation set and train on all remaining rows.
np.random.seed(111)

# The time-based split in section 3.2 still reads `data_train.date`, so deleting the
# frame here would make that cell fail with a NameError when the notebook is run top to
# bottom. Keep only the `date` column; every other column is released.
data_train = data_train[['date']]
gc.collect()

fake_valid_inds = np.random.choice(X_train.index.values, 1000000, replace = False)
# Everything that was not sampled for validation is used for training.
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

# free_raw_data=False keeps the raw frames attached to the Dataset objects.
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)

In [6]:
%%time
## 新加2个feats
params = {
        "objective" : "tweedie",
        'tweedie_variance_power': 1.1,
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.025,
        "subsample":0.5,
        "sub_feature" : 0.5,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 2**10-1,
    "min_data_in_leaf": 2**12-1,
}
#lr:0.03
#1400
#subsample:0.525
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=50)
m_lgb.save_model('model/random_split/model_tweedie_CA_var_all_new_v2.pkl')

[50]	valid_0's rmse: 2.57617
[100]	valid_0's rmse: 2.32373
[150]	valid_0's rmse: 2.25534
[200]	valid_0's rmse: 2.21923
[250]	valid_0's rmse: 2.19847
[300]	valid_0's rmse: 2.18567
[350]	valid_0's rmse: 2.17704
[400]	valid_0's rmse: 2.16825
[450]	valid_0's rmse: 2.16064
[500]	valid_0's rmse: 2.15388
[550]	valid_0's rmse: 2.14781
[600]	valid_0's rmse: 2.14183
[650]	valid_0's rmse: 2.13618
[700]	valid_0's rmse: 2.13148
[750]	valid_0's rmse: 2.12759
[800]	valid_0's rmse: 2.12351
[850]	valid_0's rmse: 2.11966
[900]	valid_0's rmse: 2.11629
[950]	valid_0's rmse: 2.11305
[1000]	valid_0's rmse: 2.10986
[1050]	valid_0's rmse: 2.10737
[1100]	valid_0's rmse: 2.10489
[1150]	valid_0's rmse: 2.10239
[1200]	valid_0's rmse: 2.10022
Wall time: 22min 23s


<lightgbm.basic.Booster at 0x248832001c8>

## 3.2 最后1个半月的数据作为验证集

In [5]:
# Time-based hold-out: rows dated after 2016-04-07 form the validation set.
# Index the row labels with the boolean mask directly instead of first building a
# filtered copy of the whole frame just to read its index.
fake_valid_inds = data_train.index[(data_train.date > '2016-04-07').values].values
del data_train
gc.collect()

# Every row that is not in the validation window is used for training.
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
# free_raw_data=False keeps the raw frames attached to the Dataset objects.
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)

In [8]:
%%time
## 新加2个feats
params = {
        "objective" : "tweedie",
        'tweedie_variance_power': 1.1,
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.01,
        "subsample":0.5,
        "sub_feature" : 0.6,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "metric": ["rmse"],
    'verbosity': 1,
    'max_bin':100,
    'num_iterations' : 900,
    'num_leaves': 2**8-1,
    "min_data_in_leaf": 2**12-1,
}

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=50)
print(datetime.now())
m_lgb.save_model('model/time_split/model_tweedie_CA_var_all_v8.pkl')

[50]	valid_0's rmse: 2.97471
[100]	valid_0's rmse: 2.51596
[150]	valid_0's rmse: 2.27421
[200]	valid_0's rmse: 2.16312
[250]	valid_0's rmse: 2.11132
[300]	valid_0's rmse: 2.08274
[350]	valid_0's rmse: 2.06243
[400]	valid_0's rmse: 2.0466
[450]	valid_0's rmse: 2.0341
[500]	valid_0's rmse: 2.02388
[550]	valid_0's rmse: 2.01695
[600]	valid_0's rmse: 2.01292
[650]	valid_0's rmse: 2.00956
[700]	valid_0's rmse: 2.00644
[750]	valid_0's rmse: 2.00463
[800]	valid_0's rmse: 2.00328
[850]	valid_0's rmse: 2.00176
[900]	valid_0's rmse: 2.00083
2020-06-30 21:20:56.552478
Wall time: 15min 9s


<lightgbm.basic.Booster at 0x23758b45e48>