In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    ic(name)
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

matrix = lag_features(matrix, [1,2,3,6,12], 'item_cnt_month')

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# # 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
# group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
# group.columns = ['date_item_type_avg_item_cnt']
# group = group.reset_index()
# matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
# matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
# matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)

# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)]=(matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0

matrix['delta_price_lag']=matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

# 每个月的天数
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)
matrix['days'] = matrix['days'].astype(np.int8)

# 开始销量
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')


ic| name: '!Якутск Орджоникидзе, 56 фран'
ic| name: '!Якутск ТЦ "Центральный" фран'
ic| name: 'Адыгея ТЦ "Мега"'
ic| name: 'Балашиха ТРК "Октябрь-Киномир"'
ic| name: 'Волжский ТЦ "Волга Молл"'
ic| name: 'Вологда ТРЦ "Мармелад"'
ic| name: 'Воронеж (Плехановская, 13)'
ic| name: 'Воронеж ТРЦ "Максимир"'
ic| name: 'Воронеж ТРЦ Сити-Парк "Град"'
ic| name: 'Выездная Торговля'
ic| name: 'Жуковский ул. Чкалова 39м?'
ic| name: 'Жуковский ул. Чкалова 39м²'
ic| name: 'Интернет-магазин ЧС'
ic| name: 'Казань ТЦ "Бехетле"'
ic| name: 'Казань ТЦ "ПаркХаус" II'
ic| name: 'Калуга ТРЦ "XXI век"'
ic| name: 'Коломна ТЦ "Рио"'
ic| name: 'Красноярск ТЦ "Взлетка Плаза"'
ic| name: 'Красноярск ТЦ "Июнь"'
ic| name: 'Курск ТЦ "Пушкинский"'
ic| name: 'Москва "Распродажа"'
ic| name: 'Москва МТРЦ "Афи Молл"'
ic| name: 'Москва Магазин С21'
ic| name: 'Москва ТК "Буденовский" (пав.А2)'
ic| name: 'Москва ТК "Буденовский" (пав.К7)'
ic| name: 'Москва ТРК "Атриум"'
ic| name: 'Москва ТЦ "Ареал" (Беляево)'
ic| name: 'Москва 

In [7]:
import lightgbm as lgb

In [21]:
matrix.shape

(6639294, 60)

In [3]:
# matrix_11128050_59_bak = matrix.copy()

In [22]:
matrix = matrix_11128050_59_bak.copy()

In [24]:
matrix.shape

(11128050, 59)

In [15]:
# **********趋势特征 delta2_cnt_month_lag ********************
group = matrix.groupby(['shop_id', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['qushi_shop_item_avg_cnt_month']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['shop_id', 'item_id'], how='left')

group = matrix.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['qushi_date_shop_item_avg_cnt_month']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'shop_id', 'item_id'], how='left')

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'qushi_date_shop_item_avg_cnt_month')
for i in lags:
    matrix['delta2_cnt_month_lag_'+str(i)] = (matrix['qushi_date_shop_item_avg_cnt_month_lag_' + str(i)] - matrix['qushi_shop_item_avg_cnt_month']) / matrix['qushi_shop_item_avg_cnt_month']

def select_trend3(row):
    for i in lags:
        if pd.notnull(row['delta2_cnt_month_lag_'+str(i)]):  # 如果不是NaN
            return row['delta2_cnt_month_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

matrix['delta2_cnt_month_lag'] = matrix.apply(select_trend3, axis=1)
matrix['delta2_cnt_month_lag'] = matrix['delta2_cnt_month_lag'].astype(np.float16)

features_to_drop = ['qushi_shop_item_avg_cnt_month','qushi_date_shop_item_avg_cnt_month']
for i in lags:
    features_to_drop += ['qushi_date_shop_item_avg_cnt_month_lag_'+str(i)]
    features_to_drop += ['delta2_cnt_month_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

In [25]:
matrix.shape

(11128050, 59)

In [26]:
# 因为有12个月的延迟特征（1，2，3，6，12）（1，2，3，4，5，6，12），所以需要删除前12月的数据
matrix = matrix[matrix.date_block_num > 11]

# 找到有NaN值的列，然后把那些列中的NaN值填充0
def fill_na(df):
    for col in df.columns:
        if ('_lag_' in col) & (df[col].isnull().any()):
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)         
    return df

matrix = fill_na(matrix)


In [27]:
matrix.shape

(6639294, 59)

# 训练

In [29]:
"""建模"""
trainData = matrix[matrix['date_block_num'] < 33]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix[matrix['date_block_num'] == 33]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

In [30]:
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 8,
    'num_leaves': 200,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9530
[LightGBM] [Info] Number of data points in the train set: 6186922, number of used features: 58
[LightGBM] [Info] Start training from score 0.288852
[1]	training's rmse: 1.18425	valid_1's rmse: 1.13338
[2]	training's rmse: 1.17976	valid_1's rmse: 1.13011
[3]	training's rmse: 1.17546	valid_1's rmse: 1.12713
[4]	training's rmse: 1.17052	valid_1's rmse: 1.12329
[5]	training's rmse: 1.1668	valid_1's rmse: 1.12058
[6]	training's rmse: 1.16227	valid_1's rmse: 1.11729
[7]	training's rmse: 1.15891	valid_1's rmse: 1.11487
[8]	training's rmse: 1.15433	valid_1's rmse: 1.11151
[9]	training's rmse: 1.15038	valid_1's rmse: 1.10895
[10]	training's rmse: 1.14709	valid_1's rmse: 1.10668
[11]	training's rmse: 1.14398	valid_1's rmse: 1.10422
[12]	training's rmse: 1.14004	valid_1's rmse: 1.10131
[13]	training's rmse: 1.13534	valid_1's rmse: 1.0978
[14]	

[138]	training's rmse: 0.894445	valid_1's rmse: 0.935813
[139]	training's rmse: 0.893506	valid_1's rmse: 0.935336
[140]	training's rmse: 0.892733	valid_1's rmse: 0.934967
[141]	training's rmse: 0.892218	valid_1's rmse: 0.934689
[142]	training's rmse: 0.891553	valid_1's rmse: 0.934405
[143]	training's rmse: 0.890871	valid_1's rmse: 0.934083
[144]	training's rmse: 0.890077	valid_1's rmse: 0.933788
[145]	training's rmse: 0.889589	valid_1's rmse: 0.933517
[146]	training's rmse: 0.888655	valid_1's rmse: 0.93287
[147]	training's rmse: 0.888041	valid_1's rmse: 0.932573
[148]	training's rmse: 0.88726	valid_1's rmse: 0.932187
[149]	training's rmse: 0.886432	valid_1's rmse: 0.931708
[150]	training's rmse: 0.885453	valid_1's rmse: 0.931134
[151]	training's rmse: 0.884945	valid_1's rmse: 0.930857
[152]	training's rmse: 0.884234	valid_1's rmse: 0.930515
[153]	training's rmse: 0.883776	valid_1's rmse: 0.930219
[154]	training's rmse: 0.882949	valid_1's rmse: 0.929657
[155]	training's rmse: 0.882056	v

[280]	training's rmse: 0.833711	valid_1's rmse: 0.908226
[281]	training's rmse: 0.833467	valid_1's rmse: 0.908161
[282]	training's rmse: 0.833263	valid_1's rmse: 0.908128
[283]	training's rmse: 0.833029	valid_1's rmse: 0.908049
[284]	training's rmse: 0.832921	valid_1's rmse: 0.907976
[285]	training's rmse: 0.83271	valid_1's rmse: 0.907944
[286]	training's rmse: 0.832541	valid_1's rmse: 0.90789
[287]	training's rmse: 0.832362	valid_1's rmse: 0.90786
[288]	training's rmse: 0.832133	valid_1's rmse: 0.907804
[289]	training's rmse: 0.831985	valid_1's rmse: 0.90773
[290]	training's rmse: 0.831838	valid_1's rmse: 0.90771
[291]	training's rmse: 0.831478	valid_1's rmse: 0.907594
[292]	training's rmse: 0.831349	valid_1's rmse: 0.907557
[293]	training's rmse: 0.831168	valid_1's rmse: 0.907562
[294]	training's rmse: 0.830833	valid_1's rmse: 0.907468
[295]	training's rmse: 0.830579	valid_1's rmse: 0.907415
[296]	training's rmse: 0.830416	valid_1's rmse: 0.907408
[297]	training's rmse: 0.830276	vali

[399]	training's rmse: 0.813967	valid_1's rmse: 0.904449
[400]	training's rmse: 0.813882	valid_1's rmse: 0.904441
[401]	training's rmse: 0.813749	valid_1's rmse: 0.904433
[402]	training's rmse: 0.813622	valid_1's rmse: 0.90441
[403]	training's rmse: 0.813476	valid_1's rmse: 0.904386
[404]	training's rmse: 0.813386	valid_1's rmse: 0.904361
[405]	training's rmse: 0.813294	valid_1's rmse: 0.9044
[406]	training's rmse: 0.813156	valid_1's rmse: 0.904442
[407]	training's rmse: 0.813094	valid_1's rmse: 0.904439
[408]	training's rmse: 0.812947	valid_1's rmse: 0.904437
[409]	training's rmse: 0.812849	valid_1's rmse: 0.904389
[410]	training's rmse: 0.81275	valid_1's rmse: 0.904452
[411]	training's rmse: 0.812602	valid_1's rmse: 0.904462
[412]	training's rmse: 0.812516	valid_1's rmse: 0.904485
[413]	training's rmse: 0.812391	valid_1's rmse: 0.904503
[414]	training's rmse: 0.812239	valid_1's rmse: 0.904453
[415]	training's rmse: 0.812109	valid_1's rmse: 0.904341
[416]	training's rmse: 0.812011	val

[504]	training's rmse: 0.803025	valid_1's rmse: 0.903522
[505]	training's rmse: 0.802943	valid_1's rmse: 0.903511
[506]	training's rmse: 0.802887	valid_1's rmse: 0.903547
[507]	training's rmse: 0.80281	valid_1's rmse: 0.903487
[508]	training's rmse: 0.802687	valid_1's rmse: 0.903455
[509]	training's rmse: 0.802611	valid_1's rmse: 0.903432
[510]	training's rmse: 0.802498	valid_1's rmse: 0.903452
[511]	training's rmse: 0.80244	valid_1's rmse: 0.903442
[512]	training's rmse: 0.802315	valid_1's rmse: 0.903433
[513]	training's rmse: 0.802205	valid_1's rmse: 0.903429
[514]	training's rmse: 0.802123	valid_1's rmse: 0.903443
[515]	training's rmse: 0.802014	valid_1's rmse: 0.903458
[516]	training's rmse: 0.80192	valid_1's rmse: 0.903473
[517]	training's rmse: 0.801827	valid_1's rmse: 0.903573
[518]	training's rmse: 0.801726	valid_1's rmse: 0.90347
[519]	training's rmse: 0.801692	valid_1's rmse: 0.903467
[520]	training's rmse: 0.801652	valid_1's rmse: 0.903475
[521]	training's rmse: 0.801602	val

[591]	training's rmse: 0.796121	valid_1's rmse: 0.90303
[592]	training's rmse: 0.796027	valid_1's rmse: 0.903046
[593]	training's rmse: 0.795959	valid_1's rmse: 0.903055
[594]	training's rmse: 0.795691	valid_1's rmse: 0.902886
[595]	training's rmse: 0.795615	valid_1's rmse: 0.902892
[596]	training's rmse: 0.795554	valid_1's rmse: 0.902898
[597]	training's rmse: 0.795498	valid_1's rmse: 0.902896
[598]	training's rmse: 0.795465	valid_1's rmse: 0.902924
[599]	training's rmse: 0.795281	valid_1's rmse: 0.902925
[600]	training's rmse: 0.795216	valid_1's rmse: 0.902928
[601]	training's rmse: 0.795166	valid_1's rmse: 0.902952
[602]	training's rmse: 0.795094	valid_1's rmse: 0.902942
[603]	training's rmse: 0.795053	valid_1's rmse: 0.902949
[604]	training's rmse: 0.794998	valid_1's rmse: 0.90295
[605]	training's rmse: 0.794955	valid_1's rmse: 0.902946
[606]	training's rmse: 0.794813	valid_1's rmse: 0.90292
[607]	training's rmse: 0.794762	valid_1's rmse: 0.90291
[608]	training's rmse: 0.79465	vali

[676]	training's rmse: 0.790218	valid_1's rmse: 0.902566
[677]	training's rmse: 0.790157	valid_1's rmse: 0.902564
[678]	training's rmse: 0.790106	valid_1's rmse: 0.90255
[679]	training's rmse: 0.790058	valid_1's rmse: 0.902551
[680]	training's rmse: 0.789995	valid_1's rmse: 0.902566
[681]	training's rmse: 0.789897	valid_1's rmse: 0.90253
[682]	training's rmse: 0.789839	valid_1's rmse: 0.902505
[683]	training's rmse: 0.789778	valid_1's rmse: 0.902514
[684]	training's rmse: 0.789727	valid_1's rmse: 0.902533
[685]	training's rmse: 0.789686	valid_1's rmse: 0.902563
[686]	training's rmse: 0.789618	valid_1's rmse: 0.902574
[687]	training's rmse: 0.789589	valid_1's rmse: 0.902583
[688]	training's rmse: 0.789546	valid_1's rmse: 0.902581
[689]	training's rmse: 0.789506	valid_1's rmse: 0.902552
[690]	training's rmse: 0.789467	valid_1's rmse: 0.90256
[691]	training's rmse: 0.789438	valid_1's rmse: 0.902551
[692]	training's rmse: 0.789375	valid_1's rmse: 0.902517
[693]	training's rmse: 0.78931	val

[757]	training's rmse: 0.785813	valid_1's rmse: 0.90283
[758]	training's rmse: 0.785771	valid_1's rmse: 0.902829
[759]	training's rmse: 0.785706	valid_1's rmse: 0.902828
[760]	training's rmse: 0.785666	valid_1's rmse: 0.902821
[761]	training's rmse: 0.785583	valid_1's rmse: 0.902846
[762]	training's rmse: 0.785546	valid_1's rmse: 0.902841
[763]	training's rmse: 0.785503	valid_1's rmse: 0.902865
[764]	training's rmse: 0.785462	valid_1's rmse: 0.902856
[765]	training's rmse: 0.78527	valid_1's rmse: 0.903028
[766]	training's rmse: 0.785231	valid_1's rmse: 0.903034
[767]	training's rmse: 0.785186	valid_1's rmse: 0.903009
[768]	training's rmse: 0.785146	valid_1's rmse: 0.903011
[769]	training's rmse: 0.785103	valid_1's rmse: 0.902995
[770]	training's rmse: 0.785066	valid_1's rmse: 0.902997
[771]	training's rmse: 0.784912	valid_1's rmse: 0.903001
[772]	training's rmse: 0.784786	valid_1's rmse: 0.902982
[773]	training's rmse: 0.784753	valid_1's rmse: 0.902996
[774]	training's rmse: 0.784727	v

[843]	training's rmse: 0.780868	valid_1's rmse: 0.902681
[844]	training's rmse: 0.780818	valid_1's rmse: 0.902689
[845]	training's rmse: 0.780772	valid_1's rmse: 0.902709
[846]	training's rmse: 0.780699	valid_1's rmse: 0.902685
[847]	training's rmse: 0.780673	valid_1's rmse: 0.902685
[848]	training's rmse: 0.780614	valid_1's rmse: 0.902678
[849]	training's rmse: 0.780583	valid_1's rmse: 0.902685
[850]	training's rmse: 0.78054	valid_1's rmse: 0.902678
[851]	training's rmse: 0.780513	valid_1's rmse: 0.902672
[852]	training's rmse: 0.780432	valid_1's rmse: 0.902683
[853]	training's rmse: 0.780405	valid_1's rmse: 0.90268
[854]	training's rmse: 0.780353	valid_1's rmse: 0.90267
[855]	training's rmse: 0.78033	valid_1's rmse: 0.902675
[856]	training's rmse: 0.780305	valid_1's rmse: 0.90268
[857]	training's rmse: 0.780275	valid_1's rmse: 0.902655
[858]	training's rmse: 0.780196	valid_1's rmse: 0.902656
[859]	training's rmse: 0.780142	valid_1's rmse: 0.90265
[860]	training's rmse: 0.780093	valid

[922]	training's rmse: 0.777635	valid_1's rmse: 0.902416
[923]	training's rmse: 0.777562	valid_1's rmse: 0.902376
[924]	training's rmse: 0.777511	valid_1's rmse: 0.902359
[925]	training's rmse: 0.777441	valid_1's rmse: 0.902364
[926]	training's rmse: 0.777407	valid_1's rmse: 0.902383
[927]	training's rmse: 0.777377	valid_1's rmse: 0.902403
[928]	training's rmse: 0.777342	valid_1's rmse: 0.902414
[929]	training's rmse: 0.7773	valid_1's rmse: 0.902385
[930]	training's rmse: 0.777156	valid_1's rmse: 0.902516
[931]	training's rmse: 0.777118	valid_1's rmse: 0.902524
[932]	training's rmse: 0.777063	valid_1's rmse: 0.902534
[933]	training's rmse: 0.777018	valid_1's rmse: 0.90256
[934]	training's rmse: 0.776982	valid_1's rmse: 0.902557
[935]	training's rmse: 0.776933	valid_1's rmse: 0.902563
[936]	training's rmse: 0.776896	valid_1's rmse: 0.90257
[937]	training's rmse: 0.776863	valid_1's rmse: 0.902569
[938]	training's rmse: 0.776805	valid_1's rmse: 0.902577
[939]	training's rmse: 0.776764	val

# 训练33个月

In [32]:
"""建模"""
trainData = matrix[matrix['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

train_data = lgb.Dataset(data=X_train, label=label_train)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 920,
    'max_depth': 8,
    'num_leaves': 200,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9550
[LightGBM] [Info] Number of data points in the train set: 6425094, number of used features: 58
[LightGBM] [Info] Start training from score 0.287729
[1]	training's rmse: 1.18242
[2]	training's rmse: 1.178
[3]	training's rmse: 1.17375
[4]	training's rmse: 1.16888
[5]	training's rmse: 1.16521
[6]	training's rmse: 1.16073
[7]	training's rmse: 1.15741
[8]	training's rmse: 1.15289
[9]	training's rmse: 1.14903
[10]	training's rmse: 1.14578
[11]	training's rmse: 1.14268
[12]	training's rmse: 1.13879
[13]	training's rmse: 1.13415
[14]	training's rmse: 1.12997
[15]	training's rmse: 1.12586
[16]	training's rmse: 1.12209
[17]	training's rmse: 1.11835
[18]	training's rmse: 1.11454
[19]	training's rmse: 1.11147
[20]	training's rmse: 1.10846
[21]	training's rmse: 1.1051
[22]	training's rmse: 1.10272
[23]	training's rmse: 1.09997
[24]	training's rm

[243]	training's rmse: 0.845795
[244]	training's rmse: 0.845451
[245]	training's rmse: 0.845116
[246]	training's rmse: 0.844896
[247]	training's rmse: 0.844585
[248]	training's rmse: 0.844216
[249]	training's rmse: 0.843994
[250]	training's rmse: 0.843728
[251]	training's rmse: 0.84342
[252]	training's rmse: 0.843188
[253]	training's rmse: 0.842941
[254]	training's rmse: 0.842765
[255]	training's rmse: 0.842619
[256]	training's rmse: 0.842367
[257]	training's rmse: 0.842172
[258]	training's rmse: 0.841992
[259]	training's rmse: 0.841739
[260]	training's rmse: 0.841517
[261]	training's rmse: 0.841312
[262]	training's rmse: 0.840974
[263]	training's rmse: 0.840794
[264]	training's rmse: 0.840608
[265]	training's rmse: 0.840274
[266]	training's rmse: 0.840041
[267]	training's rmse: 0.839875
[268]	training's rmse: 0.839619
[269]	training's rmse: 0.839326
[270]	training's rmse: 0.839115
[271]	training's rmse: 0.838934
[272]	training's rmse: 0.838788
[273]	training's rmse: 0.838555
[274]	tra

[437]	training's rmse: 0.812892
[438]	training's rmse: 0.812799
[439]	training's rmse: 0.812727
[440]	training's rmse: 0.812625
[441]	training's rmse: 0.812558
[442]	training's rmse: 0.812486
[443]	training's rmse: 0.812281
[444]	training's rmse: 0.812203
[445]	training's rmse: 0.812096
[446]	training's rmse: 0.811977
[447]	training's rmse: 0.811803
[448]	training's rmse: 0.811734
[449]	training's rmse: 0.811618
[450]	training's rmse: 0.81155
[451]	training's rmse: 0.811447
[452]	training's rmse: 0.811364
[453]	training's rmse: 0.811285
[454]	training's rmse: 0.811184
[455]	training's rmse: 0.811062
[456]	training's rmse: 0.810971
[457]	training's rmse: 0.810782
[458]	training's rmse: 0.810662
[459]	training's rmse: 0.810564
[460]	training's rmse: 0.81048
[461]	training's rmse: 0.810425
[462]	training's rmse: 0.810309
[463]	training's rmse: 0.810228
[464]	training's rmse: 0.810146
[465]	training's rmse: 0.810062
[466]	training's rmse: 0.809923
[467]	training's rmse: 0.809864
[468]	trai

[572]	training's rmse: 0.801174
[573]	training's rmse: 0.801102
[574]	training's rmse: 0.801064
[575]	training's rmse: 0.800977
[576]	training's rmse: 0.800929
[577]	training's rmse: 0.800569
[578]	training's rmse: 0.800516
[579]	training's rmse: 0.8004
[580]	training's rmse: 0.800353
[581]	training's rmse: 0.800289
[582]	training's rmse: 0.800193
[583]	training's rmse: 0.800137
[584]	training's rmse: 0.800086
[585]	training's rmse: 0.799976
[586]	training's rmse: 0.799919
[587]	training's rmse: 0.799865
[588]	training's rmse: 0.799736
[589]	training's rmse: 0.799665
[590]	training's rmse: 0.799639
[591]	training's rmse: 0.799585
[592]	training's rmse: 0.799513
[593]	training's rmse: 0.799443
[594]	training's rmse: 0.799207
[595]	training's rmse: 0.799138
[596]	training's rmse: 0.799076
[597]	training's rmse: 0.799008
[598]	training's rmse: 0.798973
[599]	training's rmse: 0.79882
[600]	training's rmse: 0.798734
[601]	training's rmse: 0.798689
[602]	training's rmse: 0.798605
[603]	train

[698]	training's rmse: 0.792842
[699]	training's rmse: 0.79278
[700]	training's rmse: 0.792757
[701]	training's rmse: 0.792719
[702]	training's rmse: 0.792679
[703]	training's rmse: 0.792632
[704]	training's rmse: 0.79258
[705]	training's rmse: 0.792438
[706]	training's rmse: 0.792413
[707]	training's rmse: 0.792325
[708]	training's rmse: 0.792289
[709]	training's rmse: 0.792234
[710]	training's rmse: 0.792193
[711]	training's rmse: 0.792116
[712]	training's rmse: 0.792056
[713]	training's rmse: 0.791975
[714]	training's rmse: 0.791933
[715]	training's rmse: 0.791865
[716]	training's rmse: 0.791818
[717]	training's rmse: 0.791745
[718]	training's rmse: 0.79162
[719]	training's rmse: 0.791569
[720]	training's rmse: 0.791515
[721]	training's rmse: 0.791486
[722]	training's rmse: 0.791439
[723]	training's rmse: 0.791404
[724]	training's rmse: 0.791316
[725]	training's rmse: 0.791288
[726]	training's rmse: 0.791225
[727]	training's rmse: 0.791188
[728]	training's rmse: 0.791134
[729]	train

[812]	training's rmse: 0.786524
[813]	training's rmse: 0.786481
[814]	training's rmse: 0.786385
[815]	training's rmse: 0.786348
[816]	training's rmse: 0.786323
[817]	training's rmse: 0.786273
[818]	training's rmse: 0.786252
[819]	training's rmse: 0.786082
[820]	training's rmse: 0.786038
[821]	training's rmse: 0.786003
[822]	training's rmse: 0.785914
[823]	training's rmse: 0.785876
[824]	training's rmse: 0.785826
[825]	training's rmse: 0.785787
[826]	training's rmse: 0.785764
[827]	training's rmse: 0.78572
[828]	training's rmse: 0.785617
[829]	training's rmse: 0.785573
[830]	training's rmse: 0.785538
[831]	training's rmse: 0.785516
[832]	training's rmse: 0.785465
[833]	training's rmse: 0.785434
[834]	training's rmse: 0.785382
[835]	training's rmse: 0.785345
[836]	training's rmse: 0.785276
[837]	training's rmse: 0.785221
[838]	training's rmse: 0.785187
[839]	training's rmse: 0.785131
[840]	training's rmse: 0.785077
[841]	training's rmse: 0.785038
[842]	training's rmse: 0.784929
[843]	tra

In [33]:
# test数据
testData = matrix[matrix['date_block_num'] == 34]
X_test = testData.drop('item_cnt_month', axis=1)

# 预测&生成文件
y_test = lgb_model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({ 'ID': range(0, 214200), 'item_cnt_month': y_test})

test0 = test[test.item_id.isin(six_zero_item_id)]
ids = list(test0.ID.values)
submission.loc[submission.ID.isin(ids), 'item_cnt_month'] = 0.0
submission.to_csv('./submit/sub1.csv', index=False)