In [123]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)   # 去掉第一层索引
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
# six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
# six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

matrix = lag_features(matrix, [1,2,3,6,12], 'item_cnt_month')

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)

# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)] = (matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

# 每个月的天数
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)
matrix['days'] = matrix['days'].astype(np.int8)

# 开始销量
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

# 因为有12个月的延迟特征（1，2，3，6，12）（1，2，3，4，5，6，12），所以需要删除前12月的数据
matrix = matrix[matrix['date_block_num'] > 11]

# 找到有NaN值的列，然后把那些列中的NaN值填充0
columns = matrix.columns
column_null = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null.append(i)

for i in column_null:
    matrix[i].fillna(0, inplace=True)

In [124]:
matrix1 = matrix.copy()
matrix2 = matrix.copy()

# 分析数值型数据

In [125]:
from scipy.special import boxcox1p, boxcox

In [126]:
columns = []
for col in matrix.columns:
    if col != 'item_cnt_month':
        if stats.skew(matrix[col]) > 0.75:
            columns.append(col)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [127]:
for i in columns:
    print(i, '>> ', stats.skew(matrix[i]))

cat_subtype_code >>  0.8529682866274095
item_cnt_month_lag_1 >>  9.40625
item_cnt_month_lag_2 >>  9.375
item_cnt_month_lag_3 >>  9.3828125
item_cnt_month_lag_6 >>  9.4921875
item_cnt_month_lag_12 >>  10.203125
date_item_avg_item_cnt_lag_1 >>  9.8125
date_item_avg_item_cnt_lag_2 >>  9.8125
date_item_avg_item_cnt_lag_3 >>  9.7890625
date_item_avg_item_cnt_lag_6 >>  9.7890625
date_item_avg_item_cnt_lag_12 >>  10.1875
date_shop_avg_item_cnt_lag_1 >>  2.05859375
date_shop_avg_item_cnt_lag_2 >>  2.044921875
date_shop_avg_item_cnt_lag_3 >>  2.046875
date_shop_avg_item_cnt_lag_6 >>  2.048828125
date_shop_avg_item_cnt_lag_12 >>  2.201171875
date_cat_avg_item_cnt_lag_1 >>  18.171875
date_cat_avg_item_cnt_lag_2 >>  18.296875
date_cat_avg_item_cnt_lag_3 >>  18.390625
date_cat_avg_item_cnt_lag_6 >>  18.734375
date_cat_avg_item_cnt_lag_12 >>  20.1875
date_cat_shop_avg_item_cnt_lag_1 >>  14.3046875
date_cat_shop_avg_item_cnt_lag_2 >>  14.4140625
date_cat_shop_avg_item_cnt_lag_3 >>  14.640625
date_cat

In [128]:
for i in columns:
    matrix[i]=boxcox1p(matrix[i], 0.15)

In [129]:
for i in columns:
    print(i, '>> ', stats.skew(matrix[i]))

cat_subtype_code >>  0.21308243656317116
item_cnt_month_lag_1 >>  3.831051826477051
item_cnt_month_lag_2 >>  3.8560307025909424
item_cnt_month_lag_3 >>  3.8862102031707764
item_cnt_month_lag_6 >>  4.013579368591309
item_cnt_month_lag_12 >>  4.534460067749023
date_item_avg_item_cnt_lag_1 >>  3.9658572673797607
date_item_avg_item_cnt_lag_2 >>  3.969264030456543
date_item_avg_item_cnt_lag_3 >>  3.9657185077667236
date_item_avg_item_cnt_lag_6 >>  4.00478458404541
date_item_avg_item_cnt_lag_12 >>  4.375425338745117
date_shop_avg_item_cnt_lag_1 >>  1.4135289192199707
date_shop_avg_item_cnt_lag_2 >>  1.397884726524353
date_shop_avg_item_cnt_lag_3 >>  1.4012951850891113
date_shop_avg_item_cnt_lag_6 >>  1.4246762990951538
date_shop_avg_item_cnt_lag_12 >>  1.6625432968139648
date_cat_avg_item_cnt_lag_1 >>  3.311107873916626
date_cat_avg_item_cnt_lag_2 >>  3.30843448638916
date_cat_avg_item_cnt_lag_3 >>  3.2901079654693604
date_cat_avg_item_cnt_lag_6 >>  3.340208053588867
date_cat_avg_item_cnt_la

In [150]:
trainData = matrix[matrix['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix[matrix['date_block_num'] == 34]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

import lightgbm as lgb
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 500,
    'num_leaves': 150,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10825
[LightGBM] [Info] Number of data points in the train set: 6425094, number of used features: 63
[LightGBM] [Info] Start training from score 0.287729
[1]	training's rmse: 1.18109	valid_1's rmse: 0.287752
[2]	training's rmse: 1.17604	valid_1's rmse: 0.287929
[3]	training's rmse: 1.17215	valid_1's rmse: 0.288316
[4]	training's rmse: 1.16849	valid_1's rmse: 0.288682
[5]	training's rmse: 1.16357	valid_1's rmse: 0.289449
[6]	training's rmse: 1.16007	valid_1's rmse: 0.289938
[7]	training's rmse: 1.15593	valid_1's rmse: 0.290899
[8]	training's rmse: 1.15133	valid_1's rmse: 0.291681
[9]	training's rmse: 1.14711	valid_1's rmse: 0.29253
[10]	training's rmse: 1.14264	valid_1's rmse: 0.293474
[11]	training's rmse: 1.13806	valid_1's rmse: 0.295022
[12]	training's rmse: 1.13383	valid_1's rmse: 0.296261
[13]	training's rmse: 1.12973	valid_1's rmse:

[138]	training's rmse: 0.879367	valid_1's rmse: 0.588629
[139]	training's rmse: 0.878578	valid_1's rmse: 0.590415
[140]	training's rmse: 0.87784	valid_1's rmse: 0.591483
[141]	training's rmse: 0.877038	valid_1's rmse: 0.592887
[142]	training's rmse: 0.87638	valid_1's rmse: 0.594614
[143]	training's rmse: 0.875345	valid_1's rmse: 0.597311
[144]	training's rmse: 0.874607	valid_1's rmse: 0.59891
[145]	training's rmse: 0.873767	valid_1's rmse: 0.600967
[146]	training's rmse: 0.873019	valid_1's rmse: 0.602578
[147]	training's rmse: 0.872341	valid_1's rmse: 0.604063
[148]	training's rmse: 0.871711	valid_1's rmse: 0.605635
[149]	training's rmse: 0.870962	valid_1's rmse: 0.60753
[150]	training's rmse: 0.870217	valid_1's rmse: 0.60936
[151]	training's rmse: 0.869672	valid_1's rmse: 0.610578
[152]	training's rmse: 0.868949	valid_1's rmse: 0.612296
[153]	training's rmse: 0.868309	valid_1's rmse: 0.61387
[154]	training's rmse: 0.867909	valid_1's rmse: 0.614759
[155]	training's rmse: 0.867169	valid

[283]	training's rmse: 0.814056	valid_1's rmse: 0.746946
[284]	training's rmse: 0.813881	valid_1's rmse: 0.747428
[285]	training's rmse: 0.813703	valid_1's rmse: 0.747831
[286]	training's rmse: 0.813343	valid_1's rmse: 0.748452
[287]	training's rmse: 0.813113	valid_1's rmse: 0.749438
[288]	training's rmse: 0.812769	valid_1's rmse: 0.750206
[289]	training's rmse: 0.81253	valid_1's rmse: 0.75121
[290]	training's rmse: 0.812326	valid_1's rmse: 0.751598
[291]	training's rmse: 0.812159	valid_1's rmse: 0.752221
[292]	training's rmse: 0.811981	valid_1's rmse: 0.752686
[293]	training's rmse: 0.81169	valid_1's rmse: 0.753376
[294]	training's rmse: 0.81141	valid_1's rmse: 0.754366
[295]	training's rmse: 0.811253	valid_1's rmse: 0.754728
[296]	training's rmse: 0.810911	valid_1's rmse: 0.755126
[297]	training's rmse: 0.810753	valid_1's rmse: 0.755702
[298]	training's rmse: 0.810458	valid_1's rmse: 0.756676
[299]	training's rmse: 0.810144	valid_1's rmse: 0.757426
[300]	training's rmse: 0.809825	val

[428]	training's rmse: 0.788542	valid_1's rmse: 0.806373
[429]	training's rmse: 0.788362	valid_1's rmse: 0.806755
[430]	training's rmse: 0.788179	valid_1's rmse: 0.807068
[431]	training's rmse: 0.788077	valid_1's rmse: 0.807164
[432]	training's rmse: 0.787936	valid_1's rmse: 0.807372
[433]	training's rmse: 0.787789	valid_1's rmse: 0.807691
[434]	training's rmse: 0.787632	valid_1's rmse: 0.808106
[435]	training's rmse: 0.787507	valid_1's rmse: 0.808209
[436]	training's rmse: 0.787403	valid_1's rmse: 0.808337
[437]	training's rmse: 0.787303	valid_1's rmse: 0.808434
[438]	training's rmse: 0.787099	valid_1's rmse: 0.80887
[439]	training's rmse: 0.787001	valid_1's rmse: 0.809158
[440]	training's rmse: 0.786886	valid_1's rmse: 0.809299
[441]	training's rmse: 0.786781	valid_1's rmse: 0.809577
[442]	training's rmse: 0.786659	valid_1's rmse: 0.809614
[443]	training's rmse: 0.78656	valid_1's rmse: 0.809712
[444]	training's rmse: 0.786466	valid_1's rmse: 0.809742
[445]	training's rmse: 0.78635	va

# 验证 

In [151]:
from sklearn.metrics import mean_squared_error
valid_prediction = lgb_model.predict(X_valid).clip(0,20)
rmse_valid = np.sqrt(mean_squared_error(valid_prediction, label_valid))
rmse_valid

0.8183262547941259

In [152]:
# 获取最近6个月销售量为0的数据
six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表

In [153]:
len(six_zero_item_id)

12893

In [154]:
test[test.item_id.isin(six_zero_item_id)]

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_cnt_month
298,298,5,18729,34,0.0
493,493,5,562,34,0.0
817,817,5,18698,34,0.0
953,953,5,19813,34,0.0
1165,1165,5,12715,34,0.0
...,...,...,...,...,...
214165,214165,45,18589,34,0.0
214167,214167,45,11467,34,0.0
214176,214176,45,11137,34,0.0
214179,214179,45,2972,34,0.0


In [155]:
# test数据
testData = matrix[matrix['date_block_num'] == 34]
X_test = testData.drop('item_cnt_month', axis=1)

# 预测&生成文件
y_test = lgb_model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({ 'ID': range(0, 214200), 'item_cnt_month': y_test})

test0 = test[test.item_id.isin(six_zero_item_id)]
ids = list(test0.ID.values)
submission.loc[submission.ID.isin(ids), 'item_cnt_month'] = 0.0
submission.to_csv('./submit/sub10.csv', index=False)