In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
# six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
# six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    ic(name)
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

matrix = lag_features(matrix, [1,2,3,6,12], 'item_cnt_month')

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)

# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)]=(matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0

matrix['delta_price_lag']=matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

# 每个月的天数
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)
matrix['days'] = matrix['days'].astype(np.int8)

# 开始销量
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

cache = {}
matrix['item_shop_last_sale'] = -1
matrix['item_shop_last_sale'] = matrix['item_shop_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = str(row.item_id)+' '+str(row.shop_id)
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        matrix.at[idx, 'item_shop_last_sale'] = row.date_block_num - last_date_block_num
        cache[key] = row.date_block_num    

# 因为有12个月的延迟特征（1，2，3，6，12）（1，2，3，4，5，6，12），所以需要删除前12月的数据
matrix = matrix[matrix['date_block_num'] > 11]

# 找到有NaN值的列，然后把那些列中的NaN值填充0
columns = matrix.columns
column_null = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null.append(i)

for i in column_null:
    matrix[i].fillna(0, inplace=True)

ic| name: '!Якутск Орджоникидзе, 56 фран'
ic| name: '!Якутск ТЦ "Центральный" фран'
ic| name: 'Адыгея ТЦ "Мега"'
ic| name: 'Балашиха ТРК "Октябрь-Киномир"'
ic| name: 'Волжский ТЦ "Волга Молл"'
ic| name: 'Вологда ТРЦ "Мармелад"'
ic| name: 'Воронеж (Плехановская, 13)'
ic| name: 'Воронеж ТРЦ "Максимир"'
ic| name: 'Воронеж ТРЦ Сити-Парк "Град"'
ic| name: 'Выездная Торговля'
ic| name: 'Жуковский ул. Чкалова 39м?'
ic| name: 'Жуковский ул. Чкалова 39м²'
ic| name: 'Интернет-магазин ЧС'
ic| name: 'Казань ТЦ "Бехетле"'
ic| name: 'Казань ТЦ "ПаркХаус" II'
ic| name: 'Калуга ТРЦ "XXI век"'
ic| name: 'Коломна ТЦ "Рио"'
ic| name: 'Красноярск ТЦ "Взлетка Плаза"'
ic| name: 'Красноярск ТЦ "Июнь"'
ic| name: 'Курск ТЦ "Пушкинский"'
ic| name: 'Москва "Распродажа"'
ic| name: 'Москва МТРЦ "Афи Молл"'
ic| name: 'Москва Магазин С21'
ic| name: 'Москва ТК "Буденовский" (пав.А2)'
ic| name: 'Москва ТК "Буденовский" (пав.К7)'
ic| name: 'Москва ТРК "Атриум"'
ic| name: 'Москва ТЦ "Ареал" (Беляево)'
ic| name: 'Москва 

In [2]:
matrix.shape

(6639294, 65)

In [None]:
matrix.shape

In [7]:
# 降低偏度
from scipy.special import boxcox1p, boxcox
from scipy import stats
columns = []
for col in matrix.columns:
    if col != 'item_cnt_month':
        if stats.skew(matrix[col]) > 0.75:
            columns.append(col)

for i in columns:
    print(i, '>> ', stats.skew(matrix[i]))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


cat_subtype_code >>  0.8529682866274095
item_cnt_month_lag_1 >>  9.40625
item_cnt_month_lag_2 >>  9.375
item_cnt_month_lag_3 >>  9.3828125
item_cnt_month_lag_6 >>  9.4921875
item_cnt_month_lag_12 >>  10.203125
date_item_avg_item_cnt_lag_1 >>  9.8125
date_item_avg_item_cnt_lag_2 >>  9.8125
date_item_avg_item_cnt_lag_3 >>  9.7890625
date_item_avg_item_cnt_lag_6 >>  9.7890625
date_item_avg_item_cnt_lag_12 >>  10.1875
date_shop_avg_item_cnt_lag_1 >>  2.05859375
date_shop_avg_item_cnt_lag_2 >>  2.044921875
date_shop_avg_item_cnt_lag_3 >>  2.046875
date_shop_avg_item_cnt_lag_6 >>  2.048828125
date_shop_avg_item_cnt_lag_12 >>  2.201171875
date_cat_avg_item_cnt_lag_1 >>  18.171875
date_cat_avg_item_cnt_lag_2 >>  18.296875
date_cat_avg_item_cnt_lag_3 >>  18.390625
date_cat_avg_item_cnt_lag_6 >>  18.734375
date_cat_avg_item_cnt_lag_12 >>  20.1875
date_cat_shop_avg_item_cnt_lag_1 >>  14.3046875
date_cat_shop_avg_item_cnt_lag_2 >>  14.4140625
date_cat_shop_avg_item_cnt_lag_3 >>  14.640625
date_cat

In [8]:
for i in columns:
    matrix[i]=boxcox1p(matrix[i], 0.15)

for i in columns:
    print(i, '>> ', stats.skew(matrix[i]))

cat_subtype_code >>  0.21308243656317116
item_cnt_month_lag_1 >>  3.831051826477051
item_cnt_month_lag_2 >>  3.8560307025909424
item_cnt_month_lag_3 >>  3.8862102031707764
item_cnt_month_lag_6 >>  4.013579368591309
item_cnt_month_lag_12 >>  4.534460067749023
date_item_avg_item_cnt_lag_1 >>  3.9658572673797607
date_item_avg_item_cnt_lag_2 >>  3.969264030456543
date_item_avg_item_cnt_lag_3 >>  3.9657185077667236
date_item_avg_item_cnt_lag_6 >>  4.00478458404541
date_item_avg_item_cnt_lag_12 >>  4.375425338745117
date_shop_avg_item_cnt_lag_1 >>  1.4135289192199707
date_shop_avg_item_cnt_lag_2 >>  1.397884726524353
date_shop_avg_item_cnt_lag_3 >>  1.4012951850891113
date_shop_avg_item_cnt_lag_6 >>  1.4246762990951538
date_shop_avg_item_cnt_lag_12 >>  1.6625432968139648
date_cat_avg_item_cnt_lag_1 >>  3.311107873916626
date_cat_avg_item_cnt_lag_2 >>  3.30843448638916
date_cat_avg_item_cnt_lag_3 >>  3.2901079654693604
date_cat_avg_item_cnt_lag_6 >>  3.340208053588867
date_cat_avg_item_cnt_la

In [13]:
"""建模"""

trainData = matrix[matrix['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix[matrix['date_block_num'] == 33]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

In [14]:
from sklearn.preprocessing import RobustScaler, StandardScaler
rs = RobustScaler()
matrix_scaler = matrix.drop('item_cnt_month', axis=1)
rs.fit(matrix_scaler)
X_train_transform = rs.transform(X_train)
X_valid_transform = rs.transform(X_valid)

In [17]:
"""建模"""

import lightgbm as lgb
train_data = lgb.Dataset(data=X_train_transform, label=label_train)
valid_data = lgb.Dataset(data=X_valid_transform, label=label_valid)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 8,
    'num_leaves': 200,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10861
[LightGBM] [Info] Number of data points in the train set: 6186922, number of used features: 64
[LightGBM] [Info] Start training from score 0.288852
[1]	training's rmse: 1.18357	valid_1's rmse: 1.1333
[2]	training's rmse: 1.1793	valid_1's rmse: 1.12993
[3]	training's rmse: 1.17597	valid_1's rmse: 1.1278
[4]	training's rmse: 1.17208	valid_1's rmse: 1.12466
[5]	training's rmse: 1.1682	valid_1's rmse: 1.12183
[6]	training's rmse: 1.16424	valid_1's rmse: 1.11894
[7]	training's rmse: 1.15954	valid_1's rmse: 1.11557
[8]	training's rmse: 1.15593	valid_1's rmse: 1.11269
[9]	training's rmse: 1.15192	valid_1's rmse: 1.10994
[10]	training's rmse: 1.14713	valid_1's rmse: 1.10636
[11]	training's rmse: 1.14298	valid_1's rmse: 1.10348
[12]	training's rmse: 1.13878	valid_1's rmse: 1.10062
[13]	training's rmse: 1.13445	valid_1's rmse: 1.09744
[14]	t

[139]	training's rmse: 0.893785	valid_1's rmse: 0.937581
[140]	training's rmse: 0.893027	valid_1's rmse: 0.937165
[141]	training's rmse: 0.892211	valid_1's rmse: 0.936779
[142]	training's rmse: 0.891013	valid_1's rmse: 0.935923
[143]	training's rmse: 0.89049	valid_1's rmse: 0.935597
[144]	training's rmse: 0.889773	valid_1's rmse: 0.935225
[145]	training's rmse: 0.888802	valid_1's rmse: 0.934624
[146]	training's rmse: 0.887915	valid_1's rmse: 0.934052
[147]	training's rmse: 0.88696	valid_1's rmse: 0.933359
[148]	training's rmse: 0.88609	valid_1's rmse: 0.932903
[149]	training's rmse: 0.885426	valid_1's rmse: 0.932578
[150]	training's rmse: 0.884624	valid_1's rmse: 0.932138
[151]	training's rmse: 0.883781	valid_1's rmse: 0.93168
[152]	training's rmse: 0.883013	valid_1's rmse: 0.931176
[153]	training's rmse: 0.882242	valid_1's rmse: 0.930748
[154]	training's rmse: 0.881396	valid_1's rmse: 0.930254
[155]	training's rmse: 0.880968	valid_1's rmse: 0.929991
[156]	training's rmse: 0.880222	val

[276]	training's rmse: 0.832795	valid_1's rmse: 0.910461
[277]	training's rmse: 0.832631	valid_1's rmse: 0.910414
[278]	training's rmse: 0.832321	valid_1's rmse: 0.910347
[279]	training's rmse: 0.832134	valid_1's rmse: 0.910326
[280]	training's rmse: 0.831951	valid_1's rmse: 0.910274
[281]	training's rmse: 0.831748	valid_1's rmse: 0.910248
[282]	training's rmse: 0.831522	valid_1's rmse: 0.910199
[283]	training's rmse: 0.831255	valid_1's rmse: 0.910047
[284]	training's rmse: 0.831042	valid_1's rmse: 0.910035
[285]	training's rmse: 0.830673	valid_1's rmse: 0.909854
[286]	training's rmse: 0.830449	valid_1's rmse: 0.90982
[287]	training's rmse: 0.830237	valid_1's rmse: 0.909769
[288]	training's rmse: 0.830059	valid_1's rmse: 0.909717
[289]	training's rmse: 0.829856	valid_1's rmse: 0.909634
[290]	training's rmse: 0.829536	valid_1's rmse: 0.909456
[291]	training's rmse: 0.829362	valid_1's rmse: 0.909411
[292]	training's rmse: 0.829157	valid_1's rmse: 0.909287
[293]	training's rmse: 0.829023	

[392]	training's rmse: 0.812628	valid_1's rmse: 0.906017
[393]	training's rmse: 0.812521	valid_1's rmse: 0.905958
[394]	training's rmse: 0.812413	valid_1's rmse: 0.905962
[395]	training's rmse: 0.812266	valid_1's rmse: 0.905899
[396]	training's rmse: 0.812161	valid_1's rmse: 0.905929
[397]	training's rmse: 0.812046	valid_1's rmse: 0.905908
[398]	training's rmse: 0.811972	valid_1's rmse: 0.905909
[399]	training's rmse: 0.811871	valid_1's rmse: 0.905929
[400]	training's rmse: 0.811747	valid_1's rmse: 0.905924
[401]	training's rmse: 0.811608	valid_1's rmse: 0.905904
[402]	training's rmse: 0.811509	valid_1's rmse: 0.905899
[403]	training's rmse: 0.811407	valid_1's rmse: 0.90593
[404]	training's rmse: 0.811288	valid_1's rmse: 0.905945
[405]	training's rmse: 0.8112	valid_1's rmse: 0.905967
[406]	training's rmse: 0.81109	valid_1's rmse: 0.905912
[407]	training's rmse: 0.810977	valid_1's rmse: 0.9059
[408]	training's rmse: 0.810852	valid_1's rmse: 0.90585
[409]	training's rmse: 0.810708	valid_

[492]	training's rmse: 0.801041	valid_1's rmse: 0.904067
[493]	training's rmse: 0.800992	valid_1's rmse: 0.904067
[494]	training's rmse: 0.800874	valid_1's rmse: 0.904071
[495]	training's rmse: 0.800807	valid_1's rmse: 0.904077
[496]	training's rmse: 0.800706	valid_1's rmse: 0.904121
[497]	training's rmse: 0.800595	valid_1's rmse: 0.904123
[498]	training's rmse: 0.800545	valid_1's rmse: 0.904116
[499]	training's rmse: 0.800474	valid_1's rmse: 0.904118
[500]	training's rmse: 0.800418	valid_1's rmse: 0.904122
[501]	training's rmse: 0.800348	valid_1's rmse: 0.904146
[502]	training's rmse: 0.800143	valid_1's rmse: 0.904232
[503]	training's rmse: 0.800007	valid_1's rmse: 0.904194
[504]	training's rmse: 0.799924	valid_1's rmse: 0.904126
[505]	training's rmse: 0.799875	valid_1's rmse: 0.904134
[506]	training's rmse: 0.799799	valid_1's rmse: 0.904122
[507]	training's rmse: 0.799715	valid_1's rmse: 0.904074
[508]	training's rmse: 0.799643	valid_1's rmse: 0.904103
[509]	training's rmse: 0.799586

[581]	training's rmse: 0.793396	valid_1's rmse: 0.903754
[582]	training's rmse: 0.793337	valid_1's rmse: 0.903753
[583]	training's rmse: 0.793286	valid_1's rmse: 0.903755
[584]	training's rmse: 0.79323	valid_1's rmse: 0.903778
[585]	training's rmse: 0.793155	valid_1's rmse: 0.903779
[586]	training's rmse: 0.793111	valid_1's rmse: 0.903785
[587]	training's rmse: 0.793064	valid_1's rmse: 0.903771
[588]	training's rmse: 0.793004	valid_1's rmse: 0.903826
[589]	training's rmse: 0.792953	valid_1's rmse: 0.903805
[590]	training's rmse: 0.792878	valid_1's rmse: 0.903774
[591]	training's rmse: 0.792842	valid_1's rmse: 0.903781
[592]	training's rmse: 0.7928	valid_1's rmse: 0.903785
[593]	training's rmse: 0.792725	valid_1's rmse: 0.903782
[594]	training's rmse: 0.792682	valid_1's rmse: 0.903786
[595]	training's rmse: 0.792628	valid_1's rmse: 0.903806
[596]	training's rmse: 0.792438	valid_1's rmse: 0.903713
[597]	training's rmse: 0.792168	valid_1's rmse: 0.903858
[598]	training's rmse: 0.792124	va

[663]	training's rmse: 0.787875	valid_1's rmse: 0.903289
[664]	training's rmse: 0.787835	valid_1's rmse: 0.903292
[665]	training's rmse: 0.787794	valid_1's rmse: 0.903311
[666]	training's rmse: 0.787747	valid_1's rmse: 0.903305
[667]	training's rmse: 0.787697	valid_1's rmse: 0.903317
[668]	training's rmse: 0.7876	valid_1's rmse: 0.903392
[669]	training's rmse: 0.787566	valid_1's rmse: 0.903395
[670]	training's rmse: 0.787511	valid_1's rmse: 0.903352
[671]	training's rmse: 0.787435	valid_1's rmse: 0.903356
[672]	training's rmse: 0.787406	valid_1's rmse: 0.903367
[673]	training's rmse: 0.787358	valid_1's rmse: 0.903357
[674]	training's rmse: 0.787296	valid_1's rmse: 0.903369
[675]	training's rmse: 0.787245	valid_1's rmse: 0.90343
[676]	training's rmse: 0.787175	valid_1's rmse: 0.903403
[677]	training's rmse: 0.787122	valid_1's rmse: 0.903439
[678]	training's rmse: 0.787059	valid_1's rmse: 0.903415
[679]	training's rmse: 0.787006	valid_1's rmse: 0.903435
[680]	training's rmse: 0.786953	va

[743]	training's rmse: 0.783299	valid_1's rmse: 0.902701
[744]	training's rmse: 0.783269	valid_1's rmse: 0.902693
[745]	training's rmse: 0.783231	valid_1's rmse: 0.902678
[746]	training's rmse: 0.783202	valid_1's rmse: 0.902674
[747]	training's rmse: 0.783163	valid_1's rmse: 0.902694
[748]	training's rmse: 0.78312	valid_1's rmse: 0.90268
[749]	training's rmse: 0.783067	valid_1's rmse: 0.902688
[750]	training's rmse: 0.783006	valid_1's rmse: 0.902667
[751]	training's rmse: 0.782953	valid_1's rmse: 0.902632
[752]	training's rmse: 0.782879	valid_1's rmse: 0.902625
[753]	training's rmse: 0.782827	valid_1's rmse: 0.902642
[754]	training's rmse: 0.782771	valid_1's rmse: 0.902701
[755]	training's rmse: 0.782747	valid_1's rmse: 0.902697
[756]	training's rmse: 0.782716	valid_1's rmse: 0.902694
[757]	training's rmse: 0.782672	valid_1's rmse: 0.902705
[758]	training's rmse: 0.782627	valid_1's rmse: 0.902702
[759]	training's rmse: 0.782542	valid_1's rmse: 0.902686
[760]	training's rmse: 0.782449	v

[829]	training's rmse: 0.779112	valid_1's rmse: 0.902818
[830]	training's rmse: 0.779068	valid_1's rmse: 0.902775
[831]	training's rmse: 0.779015	valid_1's rmse: 0.902763
[832]	training's rmse: 0.778977	valid_1's rmse: 0.902755
[833]	training's rmse: 0.778934	valid_1's rmse: 0.902782
[834]	training's rmse: 0.778894	valid_1's rmse: 0.902785
[835]	training's rmse: 0.778853	valid_1's rmse: 0.902821
[836]	training's rmse: 0.77883	valid_1's rmse: 0.902819
[837]	training's rmse: 0.778793	valid_1's rmse: 0.902814
[838]	training's rmse: 0.778762	valid_1's rmse: 0.902824
[839]	training's rmse: 0.77872	valid_1's rmse: 0.902817
[840]	training's rmse: 0.778679	valid_1's rmse: 0.902847
[841]	training's rmse: 0.778615	valid_1's rmse: 0.902858
[842]	training's rmse: 0.778532	valid_1's rmse: 0.902838
[843]	training's rmse: 0.778491	valid_1's rmse: 0.902839
[844]	training's rmse: 0.778459	valid_1's rmse: 0.902851
[845]	training's rmse: 0.778415	valid_1's rmse: 0.902869
[846]	training's rmse: 0.778388	v

[917]	training's rmse: 0.775219	valid_1's rmse: 0.902939
[918]	training's rmse: 0.775192	valid_1's rmse: 0.902942
[919]	training's rmse: 0.775149	valid_1's rmse: 0.90292
[920]	training's rmse: 0.775106	valid_1's rmse: 0.90291
[921]	training's rmse: 0.775067	valid_1's rmse: 0.902945
[922]	training's rmse: 0.775009	valid_1's rmse: 0.902878
[923]	training's rmse: 0.77488	valid_1's rmse: 0.90288
[924]	training's rmse: 0.774854	valid_1's rmse: 0.902929
[925]	training's rmse: 0.774829	valid_1's rmse: 0.902932
[926]	training's rmse: 0.774795	valid_1's rmse: 0.902929
[927]	training's rmse: 0.774763	valid_1's rmse: 0.902922
[928]	training's rmse: 0.774719	valid_1's rmse: 0.902914
[929]	training's rmse: 0.77467	valid_1's rmse: 0.902911
[930]	training's rmse: 0.774627	valid_1's rmse: 0.902919
[931]	training's rmse: 0.774603	valid_1's rmse: 0.902914
[932]	training's rmse: 0.774568	valid_1's rmse: 0.902909
[933]	training's rmse: 0.774545	valid_1's rmse: 0.902909
[934]	training's rmse: 0.774515	vali

[998]	training's rmse: 0.771994	valid_1's rmse: 0.902644
[999]	training's rmse: 0.771959	valid_1's rmse: 0.902662
[1000]	training's rmse: 0.771933	valid_1's rmse: 0.902648


In [19]:
# test数据
testData = matrix[matrix['date_block_num'] == 34]
# label_test = testData['item_cnt_month']
X_test = testData.drop('item_cnt_month', axis=1)

X_test_transform = rs.transform(X_test)

# 预测&生成文件
y_test = lgb_model.predict(X_test_transform).clip(0, 20)
submission = pd.DataFrame({ 'ID': range(0, 214200), 'item_cnt_month': y_test})
submission.to_csv('./submit/sub16.csv', index=False)