In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
# six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
# six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    ic(name)
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

matrix = lag_features(matrix, [1,2,3,6,12], 'item_cnt_month')

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)

# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)]=(matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0

matrix['delta_price_lag']=matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

# 每个月的天数
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)
matrix['days'] = matrix['days'].astype(np.int8)

# 开始销量
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

# 因为有12个月的延迟特征（1，2，3，6，12）（1，2，3，4，5，6，12），所以需要删除前12月的数据
matrix = matrix[matrix['date_block_num'] > 11]

# 找到有NaN值的列，然后把那些列中的NaN值填充0
columns = matrix.columns
column_null = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null.append(i)

for i in column_null:
    matrix[i].fillna(0, inplace=True)

ic| name: '!Якутск Орджоникидзе, 56 фран'
ic| name: '!Якутск ТЦ "Центральный" фран'
ic| name: 'Адыгея ТЦ "Мега"'
ic| name: 'Балашиха ТРК "Октябрь-Киномир"'
ic| name: 'Волжский ТЦ "Волга Молл"'
ic| name: 'Вологда ТРЦ "Мармелад"'
ic| name: 'Воронеж (Плехановская, 13)'
ic| name: 'Воронеж ТРЦ "Максимир"'
ic| name: 'Воронеж ТРЦ Сити-Парк "Град"'
ic| name: 'Выездная Торговля'
ic| name: 'Жуковский ул. Чкалова 39м?'
ic| name: 'Жуковский ул. Чкалова 39м²'
ic| name: 'Интернет-магазин ЧС'
ic| name: 'Казань ТЦ "Бехетле"'
ic| name: 'Казань ТЦ "ПаркХаус" II'
ic| name: 'Калуга ТРЦ "XXI век"'
ic| name: 'Коломна ТЦ "Рио"'
ic| name: 'Красноярск ТЦ "Взлетка Плаза"'
ic| name: 'Красноярск ТЦ "Июнь"'
ic| name: 'Курск ТЦ "Пушкинский"'
ic| name: 'Москва "Распродажа"'
ic| name: 'Москва МТРЦ "Афи Молл"'
ic| name: 'Москва Магазин С21'
ic| name: 'Москва ТК "Буденовский" (пав.А2)'
ic| name: 'Москва ТК "Буденовский" (пав.К7)'
ic| name: 'Москва ТРК "Атриум"'
ic| name: 'Москва ТЦ "Ареал" (Беляево)'
ic| name: 'Москва 

In [7]:
sales_by_shop_id

Unnamed: 0,shop_id,0,1,2,3,4,5,6,7,8,...,24,25,26,27,28,29,30,31,32,33
0,0,5578,6127,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2947,3364,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1146,488,753,583,553,832,807,875,945,...,987,907,762,859,843,804,785,942,822,727
3,3,767,798,959,647,710,784,721,890,1026,...,977,738,741,740,731,672,535,666,745,613
4,4,2114,2025,2060,285,1211,1464,1378,1713,1536,...,1188,980,978,899,893,793,842,947,732,831
5,5,0,877,1355,1008,1110,1393,1265,1510,1298,...,1404,1101,1109,1054,1012,954,991,1294,1092,1052
6,6,3686,4007,4519,3168,3022,3847,3360,3702,4208,...,2328,2329,1981,1998,1748,1539,1484,1575,1725,1802
7,7,2495,2513,2460,1540,1647,2085,2031,2397,2599,...,2084,1847,1430,1340,1217,1235,1327,1409,1287,1212
8,8,1463,1156,977,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,0,0,0,0,0,0,0,0,0,...,0,0,0,-1,0,0,0,0,0,3186


In [27]:
shop0 = [0,1,8,11,13,17,23,29,30,32,33,40,43,54]

In [28]:
test[test.shop_id.isin(shop0)]

Unnamed: 0,ID,shop_id,item_id,date_block_num


In [8]:
sales_by_item_id

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,24,25,26,27,28,29,30,31,32,33
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21802,22165,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21803,22166,0,0,0,0,0,0,0,0,0,...,7,8,12,4,8,10,8,11,5,11
21804,22167,0,0,0,0,0,0,0,0,56,...,33,46,40,38,31,33,34,29,21,37
21805,22168,2,2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
six_zero_item_id

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 34,
 35,
 36,
 37,
 41,
 43,
 44,
 46,
 47,
 48,
 50,
 52,
 54,
 55,
 56,
 58,
 60,
 61,
 63,
 65,
 66,
 68,
 71,
 72,
 73,
 74,
 75,
 78,
 81,
 82,
 84,
 85,
 86,
 88,
 89,
 90,
 92,
 94,
 95,
 96,
 97,
 98,
 99,
 101,
 102,
 105,
 106,
 108,
 111,
 112,
 114,
 115,
 116,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 141,
 144,
 145,
 151,
 152,
 164,
 165,
 169,
 170,
 175,
 176,
 177,
 178,
 179,
 197,
 198,
 199,
 200,
 209,
 210,
 215,
 218,
 220,
 221,
 222,
 223,
 224,
 227,
 230,
 232,
 233,
 235,
 236,
 242,
 243,
 244,
 246,
 247,
 250,
 251,
 256,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 270,
 272,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 282,
 285,
 290,
 297,
 300,
 303,
 305,
 306,
 307,
 309,
 310,
 311,
 312,
 316,
 317,
 318,
 319,
 320,
 32

In [12]:
test[test.item_id.isin(six_zero_item_id)]

Unnamed: 0,ID,shop_id,item_id,date_block_num
298,298,5,18729,34
493,493,5,562,34
817,817,5,18698,34
953,953,5,19813,34
1165,1165,5,12715,34
...,...,...,...,...
214165,214165,45,18589,34
214167,214167,45,11467,34
214176,214176,45,11137,34
214179,214179,45,2972,34


In [25]:
"""建模"""

trainData = matrix[matrix['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix[matrix['date_block_num'] == 34]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

import lightgbm as lgb
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {'objective': 'regression', 'metric': 'rmse', 'n_estimators': 1000, 'num_leaves': 100, 'learning_rate': 0.01, 'bagging_fraction': 0.9, 'feature_fraction': 0.3, 'bagging_seed': 0, 'early_stop_rounds': 50}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10825
[LightGBM] [Info] Number of data points in the train set: 6425094, number of used features: 63
[LightGBM] [Info] Start training from score 0.287729
[1]	training's rmse: 1.1812	valid_1's rmse: 0.287773
[2]	training's rmse: 1.17636	valid_1's rmse: 0.287916
[3]	training's rmse: 1.17265	valid_1's rmse: 0.288295
[4]	training's rmse: 1.16911	valid_1's rmse: 0.288589
[5]	training's rmse: 1.16428	valid_1's rmse: 0.289325
[6]	training's rmse: 1.16092	valid_1's rmse: 0.289779
[7]	training's rmse: 1.1569	valid_1's rmse: 0.290706
[8]	training's rmse: 1.15241	valid_1's rmse: 0.291447
[9]	training's rmse: 1.14828	valid_1's rmse: 0.292232
[10]	training's rmse: 1.14403	valid_1's rmse: 0.293108
[11]	training's rmse: 1.13955	valid_1's rmse: 0.294572
[12]	training's rmse: 1.13539	valid_1's rmse: 0.295817
[13]	training's rmse: 1.13136	valid_1's rmse: 

[138]	training's rmse: 0.889319	valid_1's rmse: 0.582693
[139]	training's rmse: 0.888532	valid_1's rmse: 0.584608
[140]	training's rmse: 0.887839	valid_1's rmse: 0.585813
[141]	training's rmse: 0.887066	valid_1's rmse: 0.587335
[142]	training's rmse: 0.886436	valid_1's rmse: 0.589081
[143]	training's rmse: 0.885452	valid_1's rmse: 0.591464
[144]	training's rmse: 0.884746	valid_1's rmse: 0.593042
[145]	training's rmse: 0.883928	valid_1's rmse: 0.595157
[146]	training's rmse: 0.883185	valid_1's rmse: 0.596892
[147]	training's rmse: 0.882519	valid_1's rmse: 0.598429
[148]	training's rmse: 0.881895	valid_1's rmse: 0.599845
[149]	training's rmse: 0.881192	valid_1's rmse: 0.60167
[150]	training's rmse: 0.880464	valid_1's rmse: 0.603568
[151]	training's rmse: 0.879933	valid_1's rmse: 0.60464
[152]	training's rmse: 0.879257	valid_1's rmse: 0.60631
[153]	training's rmse: 0.878647	valid_1's rmse: 0.607867
[154]	training's rmse: 0.878278	valid_1's rmse: 0.60875
[155]	training's rmse: 0.877586	val

[283]	training's rmse: 0.827401	valid_1's rmse: 0.73961
[284]	training's rmse: 0.827232	valid_1's rmse: 0.740109
[285]	training's rmse: 0.827061	valid_1's rmse: 0.740537
[286]	training's rmse: 0.826741	valid_1's rmse: 0.741227
[287]	training's rmse: 0.826528	valid_1's rmse: 0.741988
[288]	training's rmse: 0.826214	valid_1's rmse: 0.742782
[289]	training's rmse: 0.825986	valid_1's rmse: 0.743888
[290]	training's rmse: 0.82579	valid_1's rmse: 0.744348
[291]	training's rmse: 0.825621	valid_1's rmse: 0.744964
[292]	training's rmse: 0.825449	valid_1's rmse: 0.74544
[293]	training's rmse: 0.825158	valid_1's rmse: 0.746161
[294]	training's rmse: 0.824879	valid_1's rmse: 0.747187
[295]	training's rmse: 0.824726	valid_1's rmse: 0.74756
[296]	training's rmse: 0.824417	valid_1's rmse: 0.748004
[297]	training's rmse: 0.824265	valid_1's rmse: 0.748538
[298]	training's rmse: 0.823985	valid_1's rmse: 0.749629
[299]	training's rmse: 0.823667	valid_1's rmse: 0.750222
[300]	training's rmse: 0.823335	val

[428]	training's rmse: 0.803686	valid_1's rmse: 0.799613
[429]	training's rmse: 0.803502	valid_1's rmse: 0.800019
[430]	training's rmse: 0.803338	valid_1's rmse: 0.800336
[431]	training's rmse: 0.803251	valid_1's rmse: 0.80051
[432]	training's rmse: 0.803117	valid_1's rmse: 0.800693
[433]	training's rmse: 0.803016	valid_1's rmse: 0.801046
[434]	training's rmse: 0.802865	valid_1's rmse: 0.801586
[435]	training's rmse: 0.802744	valid_1's rmse: 0.801679
[436]	training's rmse: 0.802648	valid_1's rmse: 0.801838
[437]	training's rmse: 0.802552	valid_1's rmse: 0.801911
[438]	training's rmse: 0.802368	valid_1's rmse: 0.802535
[439]	training's rmse: 0.802271	valid_1's rmse: 0.802743
[440]	training's rmse: 0.802163	valid_1's rmse: 0.802842
[441]	training's rmse: 0.802059	valid_1's rmse: 0.803167
[442]	training's rmse: 0.801953	valid_1's rmse: 0.803319
[443]	training's rmse: 0.801858	valid_1's rmse: 0.803372
[444]	training's rmse: 0.801767	valid_1's rmse: 0.803404
[445]	training's rmse: 0.801683	

[573]	training's rmse: 0.789619	valid_1's rmse: 0.821374
[574]	training's rmse: 0.789559	valid_1's rmse: 0.821581
[575]	training's rmse: 0.789486	valid_1's rmse: 0.821767
[576]	training's rmse: 0.789412	valid_1's rmse: 0.821742
[577]	training's rmse: 0.789312	valid_1's rmse: 0.821899
[578]	training's rmse: 0.789274	valid_1's rmse: 0.821943
[579]	training's rmse: 0.789223	valid_1's rmse: 0.822073
[580]	training's rmse: 0.789131	valid_1's rmse: 0.822175
[581]	training's rmse: 0.789056	valid_1's rmse: 0.822234
[582]	training's rmse: 0.788958	valid_1's rmse: 0.822408
[583]	training's rmse: 0.788882	valid_1's rmse: 0.822584
[584]	training's rmse: 0.788825	valid_1's rmse: 0.822624
[585]	training's rmse: 0.788774	valid_1's rmse: 0.822584
[586]	training's rmse: 0.78871	valid_1's rmse: 0.822702
[587]	training's rmse: 0.788406	valid_1's rmse: 0.823174
[588]	training's rmse: 0.788359	valid_1's rmse: 0.823244
[589]	training's rmse: 0.788228	valid_1's rmse: 0.823658
[590]	training's rmse: 0.788133	

[718]	training's rmse: 0.779793	valid_1's rmse: 0.836067
[719]	training's rmse: 0.779737	valid_1's rmse: 0.836052
[720]	training's rmse: 0.779698	valid_1's rmse: 0.836135
[721]	training's rmse: 0.779657	valid_1's rmse: 0.836424
[722]	training's rmse: 0.779566	valid_1's rmse: 0.836707
[723]	training's rmse: 0.779511	valid_1's rmse: 0.836738
[724]	training's rmse: 0.779461	valid_1's rmse: 0.836694
[725]	training's rmse: 0.779421	valid_1's rmse: 0.836807
[726]	training's rmse: 0.77935	valid_1's rmse: 0.83668
[727]	training's rmse: 0.779272	valid_1's rmse: 0.836833
[728]	training's rmse: 0.779227	valid_1's rmse: 0.836853
[729]	training's rmse: 0.779185	valid_1's rmse: 0.836797
[730]	training's rmse: 0.778886	valid_1's rmse: 0.836945
[731]	training's rmse: 0.778819	valid_1's rmse: 0.836987
[732]	training's rmse: 0.778783	valid_1's rmse: 0.83698
[733]	training's rmse: 0.778733	valid_1's rmse: 0.837009
[734]	training's rmse: 0.778657	valid_1's rmse: 0.836961
[735]	training's rmse: 0.778596	va

[863]	training's rmse: 0.770631	valid_1's rmse: 0.844645
[864]	training's rmse: 0.770586	valid_1's rmse: 0.844743
[865]	training's rmse: 0.770549	valid_1's rmse: 0.84478
[866]	training's rmse: 0.7705	valid_1's rmse: 0.844751
[867]	training's rmse: 0.770468	valid_1's rmse: 0.844734
[868]	training's rmse: 0.770416	valid_1's rmse: 0.84483
[869]	training's rmse: 0.770357	valid_1's rmse: 0.844838
[870]	training's rmse: 0.770323	valid_1's rmse: 0.844919
[871]	training's rmse: 0.770285	valid_1's rmse: 0.84487
[872]	training's rmse: 0.770252	valid_1's rmse: 0.844856
[873]	training's rmse: 0.770216	valid_1's rmse: 0.844857
[874]	training's rmse: 0.770155	valid_1's rmse: 0.844825
[875]	training's rmse: 0.770104	valid_1's rmse: 0.844808
[876]	training's rmse: 0.770062	valid_1's rmse: 0.844892
[877]	training's rmse: 0.770017	valid_1's rmse: 0.844887
[878]	training's rmse: 0.76997	valid_1's rmse: 0.844879
[879]	training's rmse: 0.769788	valid_1's rmse: 0.845003
[880]	training's rmse: 0.769748	valid

In [26]:
# test数据
testData = matrix[matrix['date_block_num'] == 34]
# label_test = testData['item_cnt_month']
X_test = testData.drop('item_cnt_month', axis=1)

# 预测&生成文件
y_test = lgb_model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({ 'ID': range(0, 214200), 'item_cnt_month': y_test})

test0 = test[test.item_id.isin(six_zero_item_id)]
ids = list(test0.ID.values)

submission.loc[submission.ID.isin(ids), 'item_cnt_month'] = 0.0
submission.to_csv('./submit/sub7.csv', index=False)