In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


# Matplotlib: use the SimHei font so CJK labels render, and keep the minus
# sign displayable when that font is active.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Raw inputs. `test` holds the rows to predict; the author recorded its shape
# as (214200, 3).
test = pd.read_csv('./data/test.csv')
sales_train = pd.read_csv('./data/sales_train.csv')

def _monthly_sales_pivot(sales, key):
    """Return one row per ``key`` with the summed ``item_cnt_day`` of each month.

    Args:
        sales: frame with ``key``, ``date_block_num`` and ``item_cnt_day`` columns.
        key: name of the column to put on the rows (e.g. ``'item_id'``).

    Returns:
        DataFrame whose first column is ``key`` and whose remaining columns are
        the ``date_block_num`` values present in ``sales``, named as strings.
        Months with no record for a given ``key`` are filled with 0.
    """
    pivot = sales.pivot_table(index=[key], values=['item_cnt_day'], columns='date_block_num',
                              aggfunc='sum', fill_value=0)
    # Collapse the two-level column index to the month level only (as strings)
    # *before* resetting the index, so the key column keeps its real name and
    # no in-place patching of the column labels is needed.
    pivot.columns = pivot.columns.droplevel().map(str)
    return pivot.reset_index().rename_axis(None, axis=1)


# Monthly sales of every item (0 where an item has no record in a month).
sales_by_item_id = _monthly_sales_pivot(sales_train, 'item_id')

# Items whose summed sales are exactly 0 in each of the last six months (28..33).
recent_months = [str(month) for month in range(28, 34)]
six_zero = sales_by_item_id[(sales_by_item_id[recent_months] == 0).all(axis=1)]
six_zero_item_id = list(six_zero['item_id'].values)   # list of item_id values
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # author's note: sets 7812 test rows to 0

# Monthly sales of every shop.
sales_by_shop_id = _monthly_sales_pivot(sales_train, 'shop_id')

# Disabled exploration snippets kept from the original notebook:
# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
# For each of the 34 months, take the cartesian product of the shops and the
# items that have at least one record in that month. The author recorded the
# total as 10,913,850 rows.
cols = ['date_block_num','shop_id','item_id']
matrix = []
for month in range(34):
    month_sales = sales_train[sales_train.date_block_num == month]
    month_grid = product([month], month_sales.shop_id.unique(), month_sales.item_id.unique())
    matrix.append(np.array(list(month_grid), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix = matrix.astype({'date_block_num': np.int8, 'shop_id': np.int8})
matrix = matrix.sort_values(cols)

# Revenue of each sales record (price times quantity).
sales_train['revenue'] = sales_train['item_price'].mul(sales_train['item_cnt_day'])

# Monthly target: summed quantity per (shop, item, month). Grid cells without
# any record become 0, and the target is clipped into [0, 20].
groupby = (
    sales_train
    .groupby(['shop_id','item_id','date_block_num'])
    .agg({'item_cnt_day': 'sum'})
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = (
    matrix['item_cnt_month']
    .fillna(0)
    .clip(0, 20)
    .astype(np.float16)
)

# Test rows are treated as month 34 and cast to the same compact dtypes as the
# training grid.
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# Append the test rows below the training grid; they have no target yet, so
# their item_cnt_month is NaN after the concat and is set to 0.
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form does not update `matrix` under pandas
# Copy-on-Write.
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0)

# Item metadata: attach each item's category id to the matrix.
items = pd.read_csv('./data/items.csv')[['item_id', 'item_category_id']]
matrix = matrix.merge(items, on='item_id', how='left')

# Category metadata: the category name is split on '-'; the first piece is the
# type and the second piece (or the first, when there is only one) the subtype.
# NOTE(review): the split happens at every '-', so a subtype that itself
# contains a hyphen would be cut at that hyphen - verify against the data.
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
name_parts = categories['item_category_name'].str.split('-')
categories['split'] = name_parts
categories['type'] = [parts[0].strip() for parts in name_parts]
categories['subtype'] = [
    (parts[1] if len(parts) > 1 else parts[0]).strip()
    for parts in name_parts
]
categories = categories[['item_category_id','type','subtype']]
# The same encoder object is re-fitted for each column.
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = matrix.merge(
    categories[['item_category_id','cat_type_code','cat_subtype_code']],
    on='item_category_id',
    how='left',
)

# Shop metadata: the first space-separated token of the shop name is used as
# the city, which is then label-encoded.
shops = pd.read_csv('./data/shops.csv')
shops['split'] = shops.shop_name.str.split(' ')
shops['shop_city'] = [tokens[0] for tokens in shops['split']]
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    """Classify a shop name into a venue-type label.

    The label is chosen from the first marker found in ``name``, tested in
    this order: 'МТРЦ' -> 'МТРЦ'; 'ТЦ' or 'ТРЦ' -> 'ТЦ'; 'ТК' -> 'ТК';
    'ТРК' -> 'ТРК'. Names containing none of the markers give 'UNKNOWN'.

    Args:
        name: shop name string.

    Returns:
        One of 'МТРЦ', 'ТЦ', 'ТК', 'ТРК', 'UNKNOWN'.
    """
    # 'МТРЦ' contains 'ТРЦ' as a substring, so it has to be tested before the
    # generic 'ТЦ'/'ТРЦ' check; otherwise this branch can never be reached.
    if 'МТРЦ' in name:
        return 'МТРЦ'
    if 'ТЦ' in name or 'ТРЦ' in name:
        return 'ТЦ'
    if 'ТК' in name:
        return 'ТК'
    if 'ТРК' in name:
        return 'ТРК'
    return 'UNKNOWN'
# Venue type of every shop, derived from its name.
shops['shop_type'] = shops['shop_name'].map(st)

# Manual override: shop_id 21 is forced to the 'МТРЦ' type.
shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])

# Attach the encoded shop attributes, then shrink all id/code columns to int8.
matrix = matrix.merge(
    shops[['shop_id','shop_city_code','shop_type_code']],
    on='shop_id',
    how='left',
)
for code_col in ['item_category_id', 'cat_type_code', 'cat_subtype_code',
                 'shop_city_code', 'shop_type_code']:
    matrix[code_col] = matrix[code_col].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    """Append lagged copies of ``col`` to ``df``.

    For every ``lag`` in ``lags`` a column ``f"{col}_lag_{lag}"`` is added. A
    row's lagged value is the value of ``col`` found on the row with the same
    ``shop_id`` and ``item_id`` whose ``date_block_num`` is ``lag`` smaller;
    when no such row exists the lagged value is NaN (left join).

    Args:
        df: frame with ``date_block_num``, ``shop_id``, ``item_id`` and ``col``.
        lags: iterable of month offsets.
        col: name of the column to lag.

    Returns:
        A new frame with the extra columns; the input frame is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    history = df[keys + [col]]
    for lag in lags:
        lagged = history.copy()
        lagged.columns = keys + [col + '_lag_' + str(lag)]
        # Moving the month forward by `lag` makes the join pick up the value
        # observed `lag` months earlier.
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = pd.merge(left=df, right=lagged, on=keys, how='left')
    return df


# Lagged mean-encoding features.
#
# Each entry is (grouping keys, feature name). For every entry the mean of
# item_cnt_month per group is merged onto the matrix, its lags 1..5 are added
# through lag_features(), and the un-lagged mean is dropped again so that only
# the lagged columns remain. The order of the entries fixes the column order.
mean_encodings = [
    # all items of the month
    ('date_block_num', 'date_avg_item_cnt'),
    # per item
    (['date_block_num', 'item_id'], 'date_item_avg_item_cnt'),
    # per shop
    (['date_block_num', 'shop_id'], 'date_shop_avg_item_cnt'),
    # per item category
    (['date_block_num', 'item_category_id'], 'date_cat_avg_item_cnt'),
    # per item category and shop
    (['date_block_num', 'item_category_id', 'shop_id'], 'date_cat_shop_avg_item_cnt'),
    # per category type
    (['date_block_num', 'cat_type_code'], 'date_type_avg_item_cnt'),
    # per item and category type. Author's note: this duplicates the per-item
    # feature, because an item's category (and therefore its type) is fixed.
    (['date_block_num', 'item_id', 'cat_type_code'], 'date_item_type_avg_item_cnt'),
    # per shop city
    (['date_block_num', 'shop_city_code'], 'date_city_avg_item_cnt'),
    # per item and shop city
    (['date_block_num', 'item_id', 'shop_city_code'], 'date_item_city_avg_item_cnt'),
]

for group_keys, feature in mean_encodings:
    group = matrix.groupby(group_keys).agg({'item_cnt_month': 'mean'})
    group.columns = [feature]
    group = group.reset_index()
    matrix = pd.merge(left=matrix, right=group, on=group_keys, how='left')
    matrix = lag_features(matrix, [1,2,3,4,5], feature)
    matrix = matrix.drop(feature, axis=1)



# Price trend inputs.
# Mean price of every item over all of sales_train.
# NOTE(review): this mean spans every month, including months later than the
# row it is attached to - confirm that this is intended.
group = (
    sales_train
    .groupby('item_id')['item_price']
    .mean()
    .rename('item_avg_item_price')
    .reset_index()
)
matrix = matrix.merge(group, on='item_id', how='left')

# Mean price of every item within each month.
group = (
    sales_train
    .groupby(['date_block_num','item_id'])['item_price']
    .mean()
    .rename('date_item_avg_item_price')
    .reset_index()
)
matrix = matrix.merge(group, on=['date_block_num','item_id'], how='left')

# NOTE(review): float16 cannot represent values above 65504 (they become inf)
# and keeps only about 3 significant digits - confirm the price range fits.
for price_col in ['item_avg_item_price', 'date_item_avg_item_price']:
    matrix[price_col] = matrix[price_col].astype(np.float16)

# Monthly item price of the previous 1..5 months, and its relative deviation
# from the item's overall mean price.
lags = [1,2,3,4,5]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    lagged_price = matrix['date_item_avg_item_price_lag_' + str(i)]
    overall_price = matrix['item_avg_item_price']
    matrix['delta_price_lag_'+str(i)] = (lagged_price - overall_price) / overall_price

def select_trend(row):
    """Return the first non-null ``delta_price_lag_<i>`` of ``row``.

    The lags are inspected in the order given by the module-level ``lags``
    list. When every inspected value is null the result is 0, which stands
    for "no trend".
    """
    candidates = (row['delta_price_lag_' + str(lag)] for lag in lags)
    return next((value for value in candidates if pd.notnull(value)), 0)

# Price trend of each row: the first non-NaN delta_price_lag_<i>, scanning the
# lags in the order of `lags`; 0 ("no trend") when all of them are NaN.
# This is computed on the whole array at once instead of calling a Python
# function per row, which is very slow on a frame with millions of rows.
delta_price_cols = ['delta_price_lag_' + str(i) for i in lags]
delta_price_values = matrix[delta_price_cols].to_numpy()
has_delta_price = ~np.isnan(delta_price_values)
# argmax over booleans gives the position of the first True in every row
# (and 0 for rows without any True, which np.where masks out below).
first_valid_price = has_delta_price.argmax(axis=1)
picked_price_delta = delta_price_values[np.arange(len(delta_price_values)), first_valid_price]
matrix['delta_price_lag'] = np.where(has_delta_price.any(axis=1), picked_price_delta, 0).astype(np.float16)

# Drop the helper columns; only the single trend column is kept.
features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i), 'delta_price_lag_'+str(i)]
matrix = matrix.drop(columns=features_to_drop)

# Number of days in the month.
# `month` is date_block_num modulo 12 and indexes the table below, which lists
# the month lengths of a non-leap year starting with January.
# NOTE(review): this assumes date_block_num 0 is a January - confirm.
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days).astype(np.int8)

# Months elapsed since the (item, shop) pair - respectively the item - first
# appears in the matrix.
first_month_of_pair = matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_shop_first_sale'] = matrix['date_block_num'] - first_month_of_pair
first_month_of_item = matrix.groupby('item_id')['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - first_month_of_item

# More lagged mean-encoding features, built exactly like the earlier ones:
# mean of item_cnt_month per group, merged onto the matrix, lagged by 1..5
# months, and the un-lagged mean dropped afterwards.
shoptype_mean_encodings = [
    # per shop type
    (['date_block_num', 'shop_type_code'], 'date_shoptype_avg_item_cnt'),
    # per item and shop type
    (['date_block_num', 'item_id', 'shop_type_code'], 'date_item_shoptype_avg_item_cnt'),
    # per shop and item
    (['date_block_num', 'shop_id', 'item_id'], 'date_shopitem_avg_item_cnt'),
]

for group_keys, feature in shoptype_mean_encodings:
    group = matrix.groupby(group_keys).agg({'item_cnt_month': 'mean'})
    group.columns = [feature]
    group = group.reset_index()
    matrix = pd.merge(left=matrix, right=group, on=group_keys, how='left')
    matrix = lag_features(matrix, [1,2,3,4,5], feature)
    matrix = matrix.drop(feature, axis=1)

# Trend feature on item_cnt_month (per item).
# Mean monthly count of every item over all rows of the matrix.
# NOTE(review): this mean spans every month in the matrix, including the
# row's own month and later ones - confirm that this is intended.
group = (
    matrix
    .groupby('item_id')
    .agg({'item_cnt_month': 'mean'})
    .rename(columns={'item_cnt_month': 'trend_item_avg_cnt_month'})
    .reset_index()
)
matrix = matrix.merge(group, on='item_id', how='left')

# Mean monthly count of every item within each month.
group = (
    matrix
    .groupby(['date_block_num', 'item_id'])
    .agg({'item_cnt_month': 'mean'})
    .rename(columns={'item_cnt_month': 'trend_date_item_avg_cnt_month'})
    .reset_index()
)
matrix = matrix.merge(group, on=['date_block_num', 'item_id'], how='left')

# Per-month item count of the previous 1..5 months, and its relative deviation
# from the item's overall mean count.
lags = [1,2,3,4,5]
matrix = lag_features(matrix, lags, 'trend_date_item_avg_cnt_month')
for i in lags:
    lagged_cnt = matrix['trend_date_item_avg_cnt_month_lag_' + str(i)]
    overall_cnt = matrix['trend_item_avg_cnt_month']
    matrix['delta_cnt_month_lag_' + str(i)] = (lagged_cnt - overall_cnt) / overall_cnt


def select_trend2(row):
    """Return the first non-null ``delta_cnt_month_lag_<i>`` of ``row``.

    Lags are tried in the order of the module-level ``lags`` list; 0 is
    returned when all of them are null (0 meaning "no trend").
    """
    for lag in lags:
        delta = row['delta_cnt_month_lag_' + str(lag)]
        if not pd.isnull(delta):
            return delta
    return 0


# Count trend of each row: the first non-NaN delta_cnt_month_lag_<i>, scanning
# the lags in the order of `lags`; 0 ("no trend") when all of them are NaN.
# This is computed on the whole array at once instead of calling a Python
# function per row, which is very slow on a frame with millions of rows.
delta_cnt_cols = ['delta_cnt_month_lag_' + str(i) for i in lags]
delta_cnt_values = matrix[delta_cnt_cols].to_numpy()
has_delta_cnt = ~np.isnan(delta_cnt_values)
# argmax over booleans gives the position of the first True in every row
# (and 0 for rows without any True, which np.where masks out below).
first_valid_cnt = has_delta_cnt.argmax(axis=1)
picked_cnt_delta = delta_cnt_values[np.arange(len(delta_cnt_values)), first_valid_cnt]
matrix['delta_cnt_month_lag'] = np.where(has_delta_cnt.any(axis=1), picked_cnt_delta, 0).astype(np.float16)

# Drop the helper columns; only the single trend column is kept.
features_to_drop = ['trend_item_avg_cnt_month', 'trend_date_item_avg_cnt_month']
for i in lags:
    features_to_drop += ['trend_date_item_avg_cnt_month_lag_' + str(i), 'delta_cnt_month_lag_' + str(i)]
matrix = matrix.drop(columns=features_to_drop)


# Trend feature on item_cnt_month (per shop-item pair).
# Mean monthly count of every (shop, item) pair over all rows of the matrix.
# NOTE(review): this mean spans every month in the matrix, including the
# row's own month and later ones - confirm that this is intended.
group = (
    matrix
    .groupby(['shop_id', 'item_id'])
    .agg({'item_cnt_month': 'mean'})
    .rename(columns={'item_cnt_month': 'qushi_shop_item_avg_cnt_month'})
    .reset_index()
)
matrix = matrix.merge(group, on=['shop_id', 'item_id'], how='left')

# item_cnt_month of the previous 1..5 months for the same (shop, item) pair.
matrix = lag_features(matrix, [1,2,3,4,5], 'item_cnt_month')

# Relative deviation of each lagged count from the pair's overall mean.
for i in lags:
    lagged_pair_cnt = matrix['item_cnt_month_lag_' + str(i)]
    overall_pair_cnt = matrix['qushi_shop_item_avg_cnt_month']
    matrix['delta2_cnt_month_lag_'+str(i)] = (lagged_pair_cnt - overall_pair_cnt) / overall_pair_cnt

def select_trend3(row):
    """Return the first non-null ``delta2_cnt_month_lag_<i>`` of ``row``.

    Lags are tried in the order of the module-level ``lags`` list; 0 is
    returned when all of them are null (0 meaning "no trend").
    """
    keys = ('delta2_cnt_month_lag_' + str(i) for i in lags)
    for key in keys:
        value = row[key]
        if pd.isnull(value):
            continue
        return value
    return 0

# Shop-item count trend of each row: the first non-NaN delta2_cnt_month_lag_<i>,
# scanning the lags in the order of `lags`; 0 ("no trend") when all are NaN.
# This is computed on the whole array at once instead of calling a Python
# function per row, which is very slow on a frame with millions of rows.
delta2_cols = ['delta2_cnt_month_lag_' + str(i) for i in lags]
delta2_values = matrix[delta2_cols].to_numpy()
has_delta2 = ~np.isnan(delta2_values)
# argmax over booleans gives the position of the first True in every row
# (and 0 for rows without any True, which np.where masks out below).
first_valid_delta2 = has_delta2.argmax(axis=1)
picked_delta2 = delta2_values[np.arange(len(delta2_values)), first_valid_delta2]
matrix['delta2_cnt_month_lag'] = np.where(has_delta2.any(axis=1), picked_delta2, 0).astype(np.float16)

# Drop the helper columns. The item_cnt_month_lag_<i> columns are not in this
# list and therefore stay in the matrix.
features_to_drop = ['qushi_shop_item_avg_cnt_month']
for i in lags:
    features_to_drop += ['delta2_cnt_month_lag_'+str(i)]
matrix = matrix.drop(columns=features_to_drop)



In [3]:
# Sanity check: (rows, columns) of the matrix after feature engineering.
matrix.shape

(11128050, 81)

In [5]:
# matrix_11128050_81_bak = matrix.copy()   # 已运行

In [6]:
# The lag features above use offsets of 1..5 months, so rows of months 0-4
# cannot have a complete lag history; keep only date_block_num > 4.
matrix = matrix[matrix['date_block_num'] > 4]

In [7]:
# Sanity check: (rows, columns) after dropping months 0-4.
matrix.shape

(9255330, 81)

In [9]:
# Non-null count per column; a count below the row total means that column
# still contains NaN.
dict(matrix.count())

{'date_block_num': 9255330,
 'shop_id': 9255330,
 'item_id': 9255330,
 'item_cnt_month': 9255330,
 'item_category_id': 9255330,
 'cat_type_code': 9255330,
 'cat_subtype_code': 9255330,
 'shop_city_code': 9255330,
 'shop_type_code': 9255330,
 'date_avg_item_cnt_lag_1': 7358574,
 'date_avg_item_cnt_lag_2': 7072182,
 'date_avg_item_cnt_lag_3': 6798279,
 'date_avg_item_cnt_lag_4': 6513575,
 'date_avg_item_cnt_lag_5': 6222805,
 'date_item_avg_item_cnt_lag_1': 7358574,
 'date_item_avg_item_cnt_lag_2': 7072182,
 'date_item_avg_item_cnt_lag_3': 6798279,
 'date_item_avg_item_cnt_lag_4': 6513575,
 'date_item_avg_item_cnt_lag_5': 6222805,
 'date_shop_avg_item_cnt_lag_1': 7358574,
 'date_shop_avg_item_cnt_lag_2': 7072182,
 'date_shop_avg_item_cnt_lag_3': 6798279,
 'date_shop_avg_item_cnt_lag_4': 6513575,
 'date_shop_avg_item_cnt_lag_5': 6222805,
 'date_cat_avg_item_cnt_lag_1': 7358574,
 'date_cat_avg_item_cnt_lag_2': 7072182,
 'date_cat_avg_item_cnt_lag_3': 6798279,
 'date_cat_avg_item_cnt_lag_4':

In [10]:
# Find the columns that contain NaN and replace those NaN values with 0.
columns = matrix.columns
column_null = [name for name in columns if matrix[name].isnull().any()]

# Only the columns listed in `column_null` hold NaN, so filling the whole frame
# touches exactly those columns. The result is assigned back instead of calling
# fillna(inplace=True) on each column selection: `matrix` is a filtered slice
# at this point, and the chained in-place form does not update it under pandas
# Copy-on-Write.
matrix = matrix.fillna(0)

In [12]:
# Non-null count per column after the NaN fill; every count should now equal
# the number of rows.
dict(matrix.count())

{'date_block_num': 9255330,
 'shop_id': 9255330,
 'item_id': 9255330,
 'item_cnt_month': 9255330,
 'item_category_id': 9255330,
 'cat_type_code': 9255330,
 'cat_subtype_code': 9255330,
 'shop_city_code': 9255330,
 'shop_type_code': 9255330,
 'date_avg_item_cnt_lag_1': 9255330,
 'date_avg_item_cnt_lag_2': 9255330,
 'date_avg_item_cnt_lag_3': 9255330,
 'date_avg_item_cnt_lag_4': 9255330,
 'date_avg_item_cnt_lag_5': 9255330,
 'date_item_avg_item_cnt_lag_1': 9255330,
 'date_item_avg_item_cnt_lag_2': 9255330,
 'date_item_avg_item_cnt_lag_3': 9255330,
 'date_item_avg_item_cnt_lag_4': 9255330,
 'date_item_avg_item_cnt_lag_5': 9255330,
 'date_shop_avg_item_cnt_lag_1': 9255330,
 'date_shop_avg_item_cnt_lag_2': 9255330,
 'date_shop_avg_item_cnt_lag_3': 9255330,
 'date_shop_avg_item_cnt_lag_4': 9255330,
 'date_shop_avg_item_cnt_lag_5': 9255330,
 'date_cat_avg_item_cnt_lag_1': 9255330,
 'date_cat_avg_item_cnt_lag_2': 9255330,
 'date_cat_avg_item_cnt_lag_3': 9255330,
 'date_cat_avg_item_cnt_lag_4':

In [13]:
"""建模"""

# Time-based split: months before 33 are used for training, month 33 for
# validation. The target column is item_cnt_month; all other columns are
# used as features.
trainData = matrix.loc[matrix['date_block_num'] < 33]
X_train = trainData.drop(columns='item_cnt_month')
label_train = trainData['item_cnt_month']

validData = matrix.loc[matrix['date_block_num'] == 33]
X_valid = validData.drop(columns='item_cnt_month')
label_valid = validData['item_cnt_month']

In [15]:
import lightgbm as lgb
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {
    'objective': 'regression',  # regression task
    'metric': 'rmse',   # evaluation metric for the regression
    'n_estimators': 1000,
    'num_leaves': 200,   # number of leaves of each tree
    'learning_rate': 0.01,
    # Fraction of rows sampled for a tree.
    # NOTE(review): per the LightGBM docs bagging is only performed when
    # bagging_freq is set to a non-zero value, which is not the case here -
    # confirm whether row sampling was intended.
    'bagging_fraction': 0.9,
    'feature_fraction': 0.3,   # each tree is built on a random 30% of the features
    'bagging_seed': 0,
    # Stop when the validation metric has not improved for 50 rounds. The key
    # was previously spelled 'early_stop_rounds', which LightGBM does not
    # recognize, so early stopping was never active.
    'early_stopping_round': 50
}
# Both sets are passed so that the training and the validation RMSE are logged.
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13541
[LightGBM] [Info] Number of data points in the train set: 8802958, number of used features: 80
[LightGBM] [Info] Start training from score 0.296062
[1]	training's rmse: 1.21589	valid_1's rmse: 1.13306
[2]	training's rmse: 1.21108	valid_1's rmse: 1.12934
[3]	training's rmse: 1.20633	valid_1's rmse: 1.12595
[4]	training's rmse: 1.20112	valid_1's rmse: 1.122
[5]	training's rmse: 1.19603	valid_1's rmse: 1.11792
[6]	training's rmse: 1.19133	valid_1's rmse: 1.11441
[7]	training's rmse: 1.18671	valid_1's rmse: 1.11092
[8]	training's rmse: 1.18197	valid_1's rmse: 1.10721
[9]	training's rmse: 1.17757	valid_1's rmse: 1.10384
[10]	training's rmse: 1.17302	valid_1's rmse: 1.10028
[11]	training's rmse: 1.16833	valid_1's rmse: 1.09699
[12]	training's rmse: 1.16392	valid_1's rmse: 1.09406
[13]	training's rmse: 1.15935	valid_1's rmse: 1.09062
[14]

[139]	training's rmse: 0.893292	valid_1's rmse: 0.921315
[140]	training's rmse: 0.892635	valid_1's rmse: 0.92105
[141]	training's rmse: 0.891913	valid_1's rmse: 0.920814
[142]	training's rmse: 0.891213	valid_1's rmse: 0.920577
[143]	training's rmse: 0.890417	valid_1's rmse: 0.920437
[144]	training's rmse: 0.889676	valid_1's rmse: 0.92022
[145]	training's rmse: 0.88854	valid_1's rmse: 0.920089
[146]	training's rmse: 0.887872	valid_1's rmse: 0.9199
[147]	training's rmse: 0.887203	valid_1's rmse: 0.91957
[148]	training's rmse: 0.886352	valid_1's rmse: 0.919307
[149]	training's rmse: 0.885253	valid_1's rmse: 0.91875
[150]	training's rmse: 0.884436	valid_1's rmse: 0.918598
[151]	training's rmse: 0.883433	valid_1's rmse: 0.918116
[152]	training's rmse: 0.88248	valid_1's rmse: 0.917707
[153]	training's rmse: 0.881492	valid_1's rmse: 0.917295
[154]	training's rmse: 0.880875	valid_1's rmse: 0.917147
[155]	training's rmse: 0.880192	valid_1's rmse: 0.91693
[156]	training's rmse: 0.87942	valid_1's

[284]	training's rmse: 0.824361	valid_1's rmse: 0.906761
[285]	training's rmse: 0.823991	valid_1's rmse: 0.906659
[286]	training's rmse: 0.823539	valid_1's rmse: 0.906798
[287]	training's rmse: 0.823343	valid_1's rmse: 0.90677
[288]	training's rmse: 0.823029	valid_1's rmse: 0.906815
[289]	training's rmse: 0.822873	valid_1's rmse: 0.906784
[290]	training's rmse: 0.822574	valid_1's rmse: 0.906696
[291]	training's rmse: 0.822181	valid_1's rmse: 0.906833
[292]	training's rmse: 0.821881	valid_1's rmse: 0.90675
[293]	training's rmse: 0.82153	valid_1's rmse: 0.90709
[294]	training's rmse: 0.821175	valid_1's rmse: 0.907376
[295]	training's rmse: 0.820875	valid_1's rmse: 0.907388
[296]	training's rmse: 0.82068	valid_1's rmse: 0.907764
[297]	training's rmse: 0.820179	valid_1's rmse: 0.908041
[298]	training's rmse: 0.819896	valid_1's rmse: 0.90791
[299]	training's rmse: 0.819537	valid_1's rmse: 0.907823
[300]	training's rmse: 0.819356	valid_1's rmse: 0.907841
[301]	training's rmse: 0.819156	valid

[429]	training's rmse: 0.795405	valid_1's rmse: 0.905635
[430]	training's rmse: 0.795327	valid_1's rmse: 0.905712
[431]	training's rmse: 0.795196	valid_1's rmse: 0.905724
[432]	training's rmse: 0.795112	valid_1's rmse: 0.905699
[433]	training's rmse: 0.794942	valid_1's rmse: 0.905632
[434]	training's rmse: 0.794779	valid_1's rmse: 0.905698
[435]	training's rmse: 0.794666	valid_1's rmse: 0.905715
[436]	training's rmse: 0.794486	valid_1's rmse: 0.90566
[437]	training's rmse: 0.794353	valid_1's rmse: 0.905633
[438]	training's rmse: 0.794213	valid_1's rmse: 0.905748
[439]	training's rmse: 0.794095	valid_1's rmse: 0.905744
[440]	training's rmse: 0.793986	valid_1's rmse: 0.905758
[441]	training's rmse: 0.793906	valid_1's rmse: 0.905754
[442]	training's rmse: 0.793794	valid_1's rmse: 0.905759
[443]	training's rmse: 0.793665	valid_1's rmse: 0.905703
[444]	training's rmse: 0.793555	valid_1's rmse: 0.905665
[445]	training's rmse: 0.793432	valid_1's rmse: 0.905683
[446]	training's rmse: 0.79323	v

[574]	training's rmse: 0.77917	valid_1's rmse: 0.905437
[575]	training's rmse: 0.779069	valid_1's rmse: 0.90542
[576]	training's rmse: 0.778964	valid_1's rmse: 0.905395
[577]	training's rmse: 0.778838	valid_1's rmse: 0.905385
[578]	training's rmse: 0.778747	valid_1's rmse: 0.905385
[579]	training's rmse: 0.778692	valid_1's rmse: 0.905385
[580]	training's rmse: 0.778623	valid_1's rmse: 0.905371
[581]	training's rmse: 0.778554	valid_1's rmse: 0.905365
[582]	training's rmse: 0.778462	valid_1's rmse: 0.905359
[583]	training's rmse: 0.778383	valid_1's rmse: 0.905377
[584]	training's rmse: 0.778225	valid_1's rmse: 0.905263
[585]	training's rmse: 0.77815	valid_1's rmse: 0.905276
[586]	training's rmse: 0.778064	valid_1's rmse: 0.905256
[587]	training's rmse: 0.777971	valid_1's rmse: 0.905247
[588]	training's rmse: 0.777548	valid_1's rmse: 0.905808
[589]	training's rmse: 0.77741	valid_1's rmse: 0.905711
[590]	training's rmse: 0.777317	valid_1's rmse: 0.905715
[591]	training's rmse: 0.777261	val

[719]	training's rmse: 0.766573	valid_1's rmse: 0.905098
[720]	training's rmse: 0.766518	valid_1's rmse: 0.905097
[721]	training's rmse: 0.766464	valid_1's rmse: 0.905098
[722]	training's rmse: 0.766414	valid_1's rmse: 0.905092
[723]	training's rmse: 0.766365	valid_1's rmse: 0.905106
[724]	training's rmse: 0.766298	valid_1's rmse: 0.905076
[725]	training's rmse: 0.766217	valid_1's rmse: 0.905067
[726]	training's rmse: 0.766168	valid_1's rmse: 0.905044
[727]	training's rmse: 0.766119	valid_1's rmse: 0.905061
[728]	training's rmse: 0.766054	valid_1's rmse: 0.905098
[729]	training's rmse: 0.766	valid_1's rmse: 0.905102
[730]	training's rmse: 0.765927	valid_1's rmse: 0.905108
[731]	training's rmse: 0.765866	valid_1's rmse: 0.905096
[732]	training's rmse: 0.765815	valid_1's rmse: 0.905104
[733]	training's rmse: 0.765776	valid_1's rmse: 0.905113
[734]	training's rmse: 0.765723	valid_1's rmse: 0.905111
[735]	training's rmse: 0.765657	valid_1's rmse: 0.905125
[736]	training's rmse: 0.76562	val

[864]	training's rmse: 0.757774	valid_1's rmse: 0.902837
[865]	training's rmse: 0.757726	valid_1's rmse: 0.902839
[866]	training's rmse: 0.757674	valid_1's rmse: 0.902764
[867]	training's rmse: 0.757619	valid_1's rmse: 0.902759
[868]	training's rmse: 0.757569	valid_1's rmse: 0.90278
[869]	training's rmse: 0.757515	valid_1's rmse: 0.902745
[870]	training's rmse: 0.757474	valid_1's rmse: 0.902712
[871]	training's rmse: 0.75744	valid_1's rmse: 0.902704
[872]	training's rmse: 0.757394	valid_1's rmse: 0.902691
[873]	training's rmse: 0.757341	valid_1's rmse: 0.902684
[874]	training's rmse: 0.757302	valid_1's rmse: 0.902651
[875]	training's rmse: 0.757268	valid_1's rmse: 0.902638
[876]	training's rmse: 0.757222	valid_1's rmse: 0.902598
[877]	training's rmse: 0.757183	valid_1's rmse: 0.902639
[878]	training's rmse: 0.757113	valid_1's rmse: 0.90264
[879]	training's rmse: 0.757055	valid_1's rmse: 0.902642
[880]	training's rmse: 0.757019	valid_1's rmse: 0.902594
[881]	training's rmse: 0.756957	va

# Retrain on all labelled months (date_block_num up to and including 33)

In [18]:
# Final fit: every month that has a target (date_block_num < 34) is used for
# training, so no validation set is held out in this run.
trainData = matrix[matrix['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

train_data = lgb.Dataset(data=X_train, label=label_train)
params = {
    'objective': 'regression',  # regression task
    'metric': 'rmse',   # evaluation metric for the regression
    'n_estimators': 1000,
    'num_leaves': 200,   # number of leaves of each tree
    'learning_rate': 0.01,
    # Fraction of rows sampled for a tree.
    # NOTE(review): per the LightGBM docs bagging is only performed when
    # bagging_freq is set to a non-zero value, which is not the case here -
    # confirm whether row sampling was intended.
    'bagging_fraction': 0.9,
    'feature_fraction': 0.3,   # each tree is built on a random 30% of the features
    'bagging_seed': 0,
    # No early-stopping entry: the 'early_stop_rounds' key used before is not a
    # LightGBM parameter, and with only the training set being evaluated there
    # is no held-out score to stop on. All 1000 rounds are trained.
}
# The training set is passed as the only evaluation set, purely to log its RMSE.
lgb_model = lgb.train(params, train_data, valid_sets=[train_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13563
[LightGBM] [Info] Number of data points in the train set: 9041130, number of used features: 80
[LightGBM] [Info] Start training from score 0.295075
[1]	training's rmse: 1.21373
[2]	training's rmse: 1.20891
[3]	training's rmse: 1.20415
[4]	training's rmse: 1.19893
[5]	training's rmse: 1.19383
[6]	training's rmse: 1.18915
[7]	training's rmse: 1.18451
[8]	training's rmse: 1.17974
[9]	training's rmse: 1.17531
[10]	training's rmse: 1.17079
[11]	training's rmse: 1.16611
[12]	training's rmse: 1.16164
[13]	training's rmse: 1.15707
[14]	training's rmse: 1.15289
[15]	training's rmse: 1.14856
[16]	training's rmse: 1.14534
[17]	training's rmse: 1.14061
[18]	training's rmse: 1.13634
[19]	training's rmse: 1.13237
[20]	training's rmse: 1.12829
[21]	training's rmse: 1.12453
[22]	training's rmse: 1.12076
[23]	training's rmse: 1.11714
[24]	training'

[247]	training's rmse: 0.830423
[248]	training's rmse: 0.829891
[249]	training's rmse: 0.829568
[250]	training's rmse: 0.829098
[251]	training's rmse: 0.828706
[252]	training's rmse: 0.82853
[253]	training's rmse: 0.828218
[254]	training's rmse: 0.827774
[255]	training's rmse: 0.827401
[256]	training's rmse: 0.827201
[257]	training's rmse: 0.826776
[258]	training's rmse: 0.826564
[259]	training's rmse: 0.826366
[260]	training's rmse: 0.826037
[261]	training's rmse: 0.825792
[262]	training's rmse: 0.825431
[263]	training's rmse: 0.824982
[264]	training's rmse: 0.824606
[265]	training's rmse: 0.824006
[266]	training's rmse: 0.823742
[267]	training's rmse: 0.823465
[268]	training's rmse: 0.823087
[269]	training's rmse: 0.822787
[270]	training's rmse: 0.822443
[271]	training's rmse: 0.822267
[272]	training's rmse: 0.821924
[273]	training's rmse: 0.821662
[274]	training's rmse: 0.821324
[275]	training's rmse: 0.821101
[276]	training's rmse: 0.820923
[277]	training's rmse: 0.820593
[278]	tra

[504]	training's rmse: 0.778853
[505]	training's rmse: 0.778751
[506]	training's rmse: 0.778675
[507]	training's rmse: 0.778602
[508]	training's rmse: 0.77852
[509]	training's rmse: 0.778416
[510]	training's rmse: 0.778338
[511]	training's rmse: 0.778216
[512]	training's rmse: 0.778145
[513]	training's rmse: 0.778049
[514]	training's rmse: 0.777965
[515]	training's rmse: 0.777855
[516]	training's rmse: 0.77779
[517]	training's rmse: 0.777682
[518]	training's rmse: 0.777483
[519]	training's rmse: 0.777412
[520]	training's rmse: 0.777094
[521]	training's rmse: 0.776968
[522]	training's rmse: 0.776884
[523]	training's rmse: 0.776796
[524]	training's rmse: 0.776718
[525]	training's rmse: 0.776524
[526]	training's rmse: 0.776405
[527]	training's rmse: 0.776243
[528]	training's rmse: 0.776143
[529]	training's rmse: 0.776041
[530]	training's rmse: 0.775958
[531]	training's rmse: 0.775887
[532]	training's rmse: 0.775804
[533]	training's rmse: 0.775621
[534]	training's rmse: 0.775513
[535]	trai

[761]	training's rmse: 0.756752
[762]	training's rmse: 0.756689
[763]	training's rmse: 0.75661
[764]	training's rmse: 0.75657
[765]	training's rmse: 0.756326
[766]	training's rmse: 0.75627
[767]	training's rmse: 0.756213
[768]	training's rmse: 0.756151
[769]	training's rmse: 0.756065
[770]	training's rmse: 0.756005
[771]	training's rmse: 0.75594
[772]	training's rmse: 0.75589
[773]	training's rmse: 0.755844
[774]	training's rmse: 0.755781
[775]	training's rmse: 0.75573
[776]	training's rmse: 0.755686
[777]	training's rmse: 0.755617
[778]	training's rmse: 0.755579
[779]	training's rmse: 0.755542
[780]	training's rmse: 0.755497
[781]	training's rmse: 0.755423
[782]	training's rmse: 0.755381
[783]	training's rmse: 0.755341
[784]	training's rmse: 0.755286
[785]	training's rmse: 0.755195
[786]	training's rmse: 0.755137
[787]	training's rmse: 0.755084
[788]	training's rmse: 0.755018
[789]	training's rmse: 0.754954
[790]	training's rmse: 0.754892
[791]	training's rmse: 0.75482
[792]	training'

In [19]:
# Rows to predict: month 34.
testData = matrix[matrix['date_block_num'] == 34]
X_test = testData.drop('item_cnt_month', axis=1)

# Predict and clip into the same [0, 20] range as the training target.
y_test = lgb_model.predict(X_test).clip(0, 20)

# Take the IDs from `test` rather than a hard-coded range(0, 214200), so the
# submission follows the actual test file.
# NOTE(review): this relies on the month-34 rows of `matrix` still being in the
# row order of `test` (they were appended in that order and only left-merged
# afterwards) - confirm if the pipeline changes.
assert len(y_test) == len(test), 'prediction count does not match the number of test rows'
submission = pd.DataFrame({'ID': test['ID'].to_numpy(), 'item_cnt_month': y_test})

# Items with zero sales in each of the last six months are forced to 0.
test0 = test[test.item_id.isin(six_zero_item_id)]
ids = list(test0.ID.values)
submission.loc[submission.ID.isin(ids), 'item_cnt_month'] = 0.0
submission.to_csv('./submit/sub1.csv', index=False)