In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)   # 去掉第一层索引
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)



# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)



# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)] = (matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

# 每个月的天数
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)
matrix['days'] = matrix['days'].astype(np.int8)

# 开始销量
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')


# 月销量（商店类型）
group = matrix.groupby(['date_block_num','shop_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shoptype_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shoptype_avg_item_cnt')
matrix.drop('date_shoptype_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店类型）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_shoptype_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_shoptype_avg_item_cnt')
matrix.drop('date_item_shoptype_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店-商品）
group = matrix.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_shopitem_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shopitem_avg_item_cnt')
matrix.drop('date_shopitem_avg_item_cnt', axis=1, inplace=True)



In [13]:
matrix.shape

(11128050, 87)

In [4]:
# matrix_bak = matrix.copy()

In [14]:
# matrix = matrix_bak.copy()

In [16]:
matrix_bak.shape

(11128050, 74)

In [15]:
matrix.shape

(11128050, 74)

In [17]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,date_avg_item_cnt_lag_1,...,date_item_shoptype_avg_item_cnt_lag_1,date_item_shoptype_avg_item_cnt_lag_2,date_item_shoptype_avg_item_cnt_lag_3,date_item_shoptype_avg_item_cnt_lag_6,date_item_shoptype_avg_item_cnt_lag_12,date_shopitem_avg_item_cnt_lag_1,date_shopitem_avg_item_cnt_lag_2,date_shopitem_avg_item_cnt_lag_3,date_shopitem_avg_item_cnt_lag_6,date_shopitem_avg_item_cnt_lag_12
0,0,0,19,0.0,40,11,4,0,0,,...,,,,,,,,,,
1,0,0,27,0.0,19,5,10,0,0,,...,,,,,,,,,,
2,0,0,28,0.0,30,8,55,0,0,,...,,,,,,,,,,
3,0,0,29,0.0,23,5,16,0,0,,...,,,,,,,,,,
4,0,0,32,6.0,40,11,4,0,0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,0.258545,...,0.068970,0.033325,0.034485,0.548340,,1.0,0.0,0.0,0.0,
11128046,34,45,16188,0.0,64,14,42,21,4,0.258545,...,0.000000,0.099976,,,,0.0,0.0,,,
11128047,34,45,15757,0.0,55,13,2,21,4,0.258545,...,0.137939,0.066650,0.137939,0.129028,0.205933,0.0,0.0,0.0,0.0,0.0
11128048,34,45,19648,0.0,40,11,4,21,4,0.258545,...,0.034485,0.033325,0.172363,0.096802,,0.0,0.0,0.0,0.0,


In [18]:
# 趋势特征
group = matrix.groupby('item_id').agg({'item_cnt_month': 'mean'})
group.columns = ['trend_item_avg_cnt_month']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

In [21]:
matrix.shape

(11128050, 75)

In [22]:
group = matrix.groupby(['date_block_num','item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['trend_date_item_avg_cnt_month']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

In [25]:
matrix.shape

(11128050, 76)

In [26]:
# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'trend_date_item_avg_cnt_month')
for i in lags:
    matrix['delta_cnt_month_lag_'+str(i)] = (matrix['trend_date_item_avg_cnt_month_lag_' + str(i)] - matrix['trend_item_avg_cnt_month']) / matrix['trend_item_avg_cnt_month']

def select_trend2(row):
    for i in lags:
        if pd.notnull(row['delta_cnt_month_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_cnt_month_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

In [29]:
matrix.shape

(11128050, 90)

In [30]:
matrix['delta_cnt_month_lag'] = matrix.apply(select_trend2, axis=1)
matrix['delta_cnt_month_lag'] = matrix['delta_cnt_month_lag'].astype(np.float16)

In [31]:
matrix.shape

(11128050, 91)

In [43]:
matrix.shape

(11128050, 87)

In [44]:
del_cols = [ 'trend_date_item_avg_cnt_month_lag_2',
       'trend_date_item_avg_cnt_month_lag_3',
       'trend_date_item_avg_cnt_month_lag_4',
       'trend_date_item_avg_cnt_month_lag_5',
       'trend_date_item_avg_cnt_month_lag_6',
       'trend_date_item_avg_cnt_month_lag_12', 'delta_cnt_month_lag_2',
       'delta_cnt_month_lag_3', 'delta_cnt_month_lag_4',
       'delta_cnt_month_lag_5', 'delta_cnt_month_lag_6',
       'delta_cnt_month_lag_12']
matrix.drop(del_cols, axis=1, inplace=True)

In [46]:
matrix.shape

(11128050, 75)

In [47]:
# matrix_11128050_75_bak = matrix.copy()

In [48]:
matrix.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'item_category_id', 'cat_type_code', 'cat_subtype_code',
       'shop_city_code', 'shop_type_code', 'date_avg_item_cnt_lag_1',
       'date_avg_item_cnt_lag_2', 'date_avg_item_cnt_lag_3',
       'date_avg_item_cnt_lag_6', 'date_avg_item_cnt_lag_12',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12',
       'date_cat_avg_item_cnt_lag_1', 'date_cat_avg_item_cnt_lag_2',
       'date_cat_avg_item_cnt_lag_3', 'date_cat_avg_item_cnt_lag_6',
       'date_cat_avg_item_cnt_lag_12', 'date_cat_shop_avg_item_cnt_lag_1',
       'date_cat_shop_avg_item_cnt_lag_2', 'date_cat_shop_avg_item_cnt_lag_3',
       'date_cat_shop_avg_ite

In [54]:
if 'item_cnt_month_lag_3' in matrix.columns:
    print('yes')

yes


In [52]:
matrix = lag_features(matrix, [1,2,3,4,5,6,12], 'item_cnt_month')

In [53]:
matrix.shape

(11128050, 82)

In [55]:
matrix.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'item_category_id', 'cat_type_code', 'cat_subtype_code',
       'shop_city_code', 'shop_type_code', 'date_avg_item_cnt_lag_1',
       'date_avg_item_cnt_lag_2', 'date_avg_item_cnt_lag_3',
       'date_avg_item_cnt_lag_6', 'date_avg_item_cnt_lag_12',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12',
       'date_cat_avg_item_cnt_lag_1', 'date_cat_avg_item_cnt_lag_2',
       'date_cat_avg_item_cnt_lag_3', 'date_cat_avg_item_cnt_lag_6',
       'date_cat_avg_item_cnt_lag_12', 'date_cat_shop_avg_item_cnt_lag_1',
       'date_cat_shop_avg_item_cnt_lag_2', 'date_cat_shop_avg_item_cnt_lag_3',
       'date_cat_shop_avg_ite

In [56]:
matrix.drop(['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'item_cnt_month_lag_4', 'item_cnt_month_lag_5', 'item_cnt_month_lag_6',
       'item_cnt_month_lag_12'], axis=1, inplace=True)

In [57]:
matrix.shape

(11128050, 75)

In [58]:
# 趋势特征 item_cnt_month 
group = matrix.groupby(['shop_id', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['qushi_shop_item_avg_cnt_month']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['shop_id', 'item_id'], how='left')

group = matrix.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['qushi_date_shop_item_avg_cnt_month']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'shop_id', 'item_id'], how='left')

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'qushi_date_shop_item_avg_cnt_month')
for i in lags:
    matrix['delta2_cnt_month_lag_'+str(i)] = (matrix['qushi_date_shop_item_avg_cnt_month_lag_' + str(i)] - matrix['qushi_shop_item_avg_cnt_month']) / matrix['qushi_shop_item_avg_cnt_month']

def select_trend3(row):
    for i in lags:
        if pd.notnull(row['delta2_cnt_month_lag_'+str(i)]):  # 如果不是NaN
            return row['delta2_cnt_month_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

matrix['delta2_cnt_month_lag'] = matrix.apply(select_trend3, axis=1)
matrix['delta2_cnt_month_lag'] = matrix['delta2_cnt_month_lag'].astype(np.float16)

features_to_drop = ['qushi_shop_item_avg_cnt_month','qushi_date_shop_item_avg_cnt_month']
for i in lags:
    features_to_drop += ['delta2_cnt_month_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

In [59]:
matrix.shape

(11128050, 83)

In [60]:
matrix_11128050_83_bak = matrix.copy()

In [77]:
matrix = matrix_11128050_83_bak.copy()

In [78]:
matrix.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'item_category_id', 'cat_type_code', 'cat_subtype_code',
       'shop_city_code', 'shop_type_code', 'date_avg_item_cnt_lag_1',
       'date_avg_item_cnt_lag_2', 'date_avg_item_cnt_lag_3',
       'date_avg_item_cnt_lag_6', 'date_avg_item_cnt_lag_12',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12',
       'date_cat_avg_item_cnt_lag_1', 'date_cat_avg_item_cnt_lag_2',
       'date_cat_avg_item_cnt_lag_3', 'date_cat_avg_item_cnt_lag_6',
       'date_cat_avg_item_cnt_lag_12', 'date_cat_shop_avg_item_cnt_lag_1',
       'date_cat_shop_avg_item_cnt_lag_2', 'date_cat_shop_avg_item_cnt_lag_3',
       'date_cat_shop_avg_ite

In [61]:
# 因为有12个月的延迟特征（1，2，3，6，12）（1，2，3，4，5，6，12），所以需要删除前12月的数据
matrix = matrix[matrix['date_block_num'] > 11]

# 找到有NaN值的列，然后把那些列中的NaN值填充0
columns = matrix.columns
column_null = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null.append(i)

for i in column_null:
    matrix[i].fillna(0, inplace=True)

In [67]:
matrix.shape

(6639294, 83)

In [68]:
"""建模"""

trainData = matrix[matrix['date_block_num'] < 33]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix[matrix['date_block_num'] == 33]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

import lightgbm as lgb
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 7,
    'num_leaves': 180,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.005,
    'bagging_fraction': 0.7,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13242
[LightGBM] [Info] Number of data points in the train set: 6186922, number of used features: 82
[LightGBM] [Info] Start training from score 0.288852
[1]	training's rmse: 1.18554	valid_1's rmse: 1.13442
[2]	training's rmse: 1.18344	valid_1's rmse: 1.13293
[3]	training's rmse: 1.18094	valid_1's rmse: 1.13112
[4]	training's rmse: 1.17898	valid_1's rmse: 1.12963
[5]	training's rmse: 1.17699	valid_1's rmse: 1.12825
[6]	training's rmse: 1.17475	valid_1's rmse: 1.12664
[7]	training's rmse: 1.17243	valid_1's rmse: 1.125
[8]	training's rmse: 1.17009	valid_1's rmse: 1.12333
[9]	training's rmse: 1.16795	valid_1's rmse: 1.12188
[10]	training's rmse: 1.16566	valid_1's rmse: 1.12016
[11]	training's rmse: 1.16304	valid_1's rmse: 1.11796
[12]	training's rmse: 1.16081	valid_1's rmse: 1.11642
[13]	training's rmse: 1.15868	valid_1's rmse: 1.11493
[14]

[71]	training's rmse: 1.05456	valid_1's rmse: 1.04064
[72]	training's rmse: 1.05308	valid_1's rmse: 1.03966
[73]	training's rmse: 1.05159	valid_1's rmse: 1.03866
[74]	training's rmse: 1.04973	valid_1's rmse: 1.03703
[75]	training's rmse: 1.04823	valid_1's rmse: 1.03601
[76]	training's rmse: 1.04711	valid_1's rmse: 1.03521
[77]	training's rmse: 1.04566	valid_1's rmse: 1.03425
[78]	training's rmse: 1.04386	valid_1's rmse: 1.03281
[79]	training's rmse: 1.04235	valid_1's rmse: 1.0317
[80]	training's rmse: 1.04094	valid_1's rmse: 1.03077
[81]	training's rmse: 1.03984	valid_1's rmse: 1.02995
[82]	training's rmse: 1.03857	valid_1's rmse: 1.02905
[83]	training's rmse: 1.03732	valid_1's rmse: 1.02825
[84]	training's rmse: 1.03602	valid_1's rmse: 1.02742
[85]	training's rmse: 1.03465	valid_1's rmse: 1.02656
[86]	training's rmse: 1.03326	valid_1's rmse: 1.02561
[87]	training's rmse: 1.03197	valid_1's rmse: 1.02475
[88]	training's rmse: 1.03063	valid_1's rmse: 1.02384
[89]	training's rmse: 1.02934

[144]	training's rmse: 0.967971	valid_1's rmse: 0.981467
[145]	training's rmse: 0.96719	valid_1's rmse: 0.981015
[146]	training's rmse: 0.966327	valid_1's rmse: 0.980519
[147]	training's rmse: 0.965434	valid_1's rmse: 0.979917
[148]	training's rmse: 0.964549	valid_1's rmse: 0.979371
[149]	training's rmse: 0.963631	valid_1's rmse: 0.978707
[150]	training's rmse: 0.962778	valid_1's rmse: 0.978226
[151]	training's rmse: 0.961594	valid_1's rmse: 0.977294
[152]	training's rmse: 0.960755	valid_1's rmse: 0.976831
[153]	training's rmse: 0.959696	valid_1's rmse: 0.976025
[154]	training's rmse: 0.958852	valid_1's rmse: 0.975489
[155]	training's rmse: 0.958199	valid_1's rmse: 0.975086
[156]	training's rmse: 0.957197	valid_1's rmse: 0.974371
[157]	training's rmse: 0.95615	valid_1's rmse: 0.973628
[158]	training's rmse: 0.954923	valid_1's rmse: 0.972662
[159]	training's rmse: 0.954172	valid_1's rmse: 0.972223
[160]	training's rmse: 0.953361	valid_1's rmse: 0.971736
[161]	training's rmse: 0.952547	v

[217]	training's rmse: 0.911812	valid_1's rmse: 0.945275
[218]	training's rmse: 0.911296	valid_1's rmse: 0.945041
[219]	training's rmse: 0.910795	valid_1's rmse: 0.944779
[220]	training's rmse: 0.910155	valid_1's rmse: 0.944375
[221]	training's rmse: 0.909623	valid_1's rmse: 0.944111
[222]	training's rmse: 0.909183	valid_1's rmse: 0.943881
[223]	training's rmse: 0.908663	valid_1's rmse: 0.943628
[224]	training's rmse: 0.908168	valid_1's rmse: 0.943407
[225]	training's rmse: 0.907643	valid_1's rmse: 0.943148
[226]	training's rmse: 0.907101	valid_1's rmse: 0.942891
[227]	training's rmse: 0.906614	valid_1's rmse: 0.942633
[228]	training's rmse: 0.906067	valid_1's rmse: 0.942337
[229]	training's rmse: 0.905651	valid_1's rmse: 0.942127
[230]	training's rmse: 0.905128	valid_1's rmse: 0.941863
[231]	training's rmse: 0.904452	valid_1's rmse: 0.941378
[232]	training's rmse: 0.903979	valid_1's rmse: 0.941133
[233]	training's rmse: 0.903564	valid_1's rmse: 0.940941
[234]	training's rmse: 0.90312	

[284]	training's rmse: 0.880736	valid_1's rmse: 0.928901
[285]	training's rmse: 0.880331	valid_1's rmse: 0.928675
[286]	training's rmse: 0.880067	valid_1's rmse: 0.92853
[287]	training's rmse: 0.879677	valid_1's rmse: 0.928376
[288]	training's rmse: 0.87924	valid_1's rmse: 0.928115
[289]	training's rmse: 0.878916	valid_1's rmse: 0.928006
[290]	training's rmse: 0.878576	valid_1's rmse: 0.927875
[291]	training's rmse: 0.878027	valid_1's rmse: 0.927549
[292]	training's rmse: 0.877633	valid_1's rmse: 0.927348
[293]	training's rmse: 0.877265	valid_1's rmse: 0.927191
[294]	training's rmse: 0.876821	valid_1's rmse: 0.926948
[295]	training's rmse: 0.876496	valid_1's rmse: 0.926829
[296]	training's rmse: 0.87605	valid_1's rmse: 0.926615
[297]	training's rmse: 0.875708	valid_1's rmse: 0.926482
[298]	training's rmse: 0.875369	valid_1's rmse: 0.92636
[299]	training's rmse: 0.874933	valid_1's rmse: 0.92614
[300]	training's rmse: 0.874573	valid_1's rmse: 0.925866
[301]	training's rmse: 0.874288	vali

[355]	training's rmse: 0.85888	valid_1's rmse: 0.918963
[356]	training's rmse: 0.858644	valid_1's rmse: 0.918875
[357]	training's rmse: 0.858335	valid_1's rmse: 0.918629
[358]	training's rmse: 0.858056	valid_1's rmse: 0.918473
[359]	training's rmse: 0.857732	valid_1's rmse: 0.918304
[360]	training's rmse: 0.857367	valid_1's rmse: 0.918152
[361]	training's rmse: 0.857077	valid_1's rmse: 0.917988
[362]	training's rmse: 0.856799	valid_1's rmse: 0.91783
[363]	training's rmse: 0.856583	valid_1's rmse: 0.917746
[364]	training's rmse: 0.856368	valid_1's rmse: 0.917712
[365]	training's rmse: 0.856018	valid_1's rmse: 0.917514
[366]	training's rmse: 0.855783	valid_1's rmse: 0.917472
[367]	training's rmse: 0.855608	valid_1's rmse: 0.91745
[368]	training's rmse: 0.8553	valid_1's rmse: 0.917283
[369]	training's rmse: 0.85504	valid_1's rmse: 0.917188
[370]	training's rmse: 0.854801	valid_1's rmse: 0.9171
[371]	training's rmse: 0.854591	valid_1's rmse: 0.91705
[372]	training's rmse: 0.854406	valid_1'

[420]	training's rmse: 0.843609	valid_1's rmse: 0.913051
[421]	training's rmse: 0.843372	valid_1's rmse: 0.912875
[422]	training's rmse: 0.843226	valid_1's rmse: 0.912847
[423]	training's rmse: 0.843082	valid_1's rmse: 0.912846
[424]	training's rmse: 0.842913	valid_1's rmse: 0.912815
[425]	training's rmse: 0.842653	valid_1's rmse: 0.912682
[426]	training's rmse: 0.842355	valid_1's rmse: 0.912545
[427]	training's rmse: 0.842091	valid_1's rmse: 0.912397
[428]	training's rmse: 0.841961	valid_1's rmse: 0.912362
[429]	training's rmse: 0.841827	valid_1's rmse: 0.912364
[430]	training's rmse: 0.841577	valid_1's rmse: 0.912224
[431]	training's rmse: 0.841394	valid_1's rmse: 0.912113
[432]	training's rmse: 0.841239	valid_1's rmse: 0.912115
[433]	training's rmse: 0.841064	valid_1's rmse: 0.912034
[434]	training's rmse: 0.840899	valid_1's rmse: 0.911976
[435]	training's rmse: 0.840777	valid_1's rmse: 0.911963
[436]	training's rmse: 0.840587	valid_1's rmse: 0.91184
[437]	training's rmse: 0.840454	

[485]	training's rmse: 0.833328	valid_1's rmse: 0.909737
[486]	training's rmse: 0.833194	valid_1's rmse: 0.909721
[487]	training's rmse: 0.833003	valid_1's rmse: 0.909672
[488]	training's rmse: 0.832769	valid_1's rmse: 0.909552
[489]	training's rmse: 0.832498	valid_1's rmse: 0.909405
[490]	training's rmse: 0.832288	valid_1's rmse: 0.909249
[491]	training's rmse: 0.832106	valid_1's rmse: 0.909152
[492]	training's rmse: 0.831992	valid_1's rmse: 0.909115
[493]	training's rmse: 0.831803	valid_1's rmse: 0.909092
[494]	training's rmse: 0.831711	valid_1's rmse: 0.909086
[495]	training's rmse: 0.831566	valid_1's rmse: 0.909023
[496]	training's rmse: 0.831454	valid_1's rmse: 0.908995
[497]	training's rmse: 0.831266	valid_1's rmse: 0.908931
[498]	training's rmse: 0.831121	valid_1's rmse: 0.908901
[499]	training's rmse: 0.830933	valid_1's rmse: 0.908853
[500]	training's rmse: 0.830843	valid_1's rmse: 0.908831
[501]	training's rmse: 0.830691	valid_1's rmse: 0.908812
[502]	training's rmse: 0.830535

[549]	training's rmse: 0.824926	valid_1's rmse: 0.90729
[550]	training's rmse: 0.824825	valid_1's rmse: 0.907261
[551]	training's rmse: 0.824625	valid_1's rmse: 0.907203
[552]	training's rmse: 0.824509	valid_1's rmse: 0.907182
[553]	training's rmse: 0.824408	valid_1's rmse: 0.907179
[554]	training's rmse: 0.824337	valid_1's rmse: 0.907177
[555]	training's rmse: 0.824251	valid_1's rmse: 0.907192
[556]	training's rmse: 0.824105	valid_1's rmse: 0.907133
[557]	training's rmse: 0.823967	valid_1's rmse: 0.907095
[558]	training's rmse: 0.823885	valid_1's rmse: 0.907118
[559]	training's rmse: 0.823827	valid_1's rmse: 0.907113
[560]	training's rmse: 0.823757	valid_1's rmse: 0.907104
[561]	training's rmse: 0.823673	valid_1's rmse: 0.907136
[562]	training's rmse: 0.823587	valid_1's rmse: 0.907136
[563]	training's rmse: 0.823495	valid_1's rmse: 0.907062
[564]	training's rmse: 0.823348	valid_1's rmse: 0.90699
[565]	training's rmse: 0.823145	valid_1's rmse: 0.906879
[566]	training's rmse: 0.82306	va

[612]	training's rmse: 0.818132	valid_1's rmse: 0.90527
[613]	training's rmse: 0.817949	valid_1's rmse: 0.905218
[614]	training's rmse: 0.817894	valid_1's rmse: 0.905203
[615]	training's rmse: 0.817827	valid_1's rmse: 0.905207
[616]	training's rmse: 0.817757	valid_1's rmse: 0.905215
[617]	training's rmse: 0.817665	valid_1's rmse: 0.905163
[618]	training's rmse: 0.817601	valid_1's rmse: 0.905177
[619]	training's rmse: 0.817532	valid_1's rmse: 0.905177
[620]	training's rmse: 0.817463	valid_1's rmse: 0.905145
[621]	training's rmse: 0.817413	valid_1's rmse: 0.905135
[622]	training's rmse: 0.817267	valid_1's rmse: 0.905077
[623]	training's rmse: 0.817167	valid_1's rmse: 0.905071
[624]	training's rmse: 0.817046	valid_1's rmse: 0.905056
[625]	training's rmse: 0.816996	valid_1's rmse: 0.905044
[626]	training's rmse: 0.816923	valid_1's rmse: 0.905056
[627]	training's rmse: 0.816848	valid_1's rmse: 0.905021
[628]	training's rmse: 0.816739	valid_1's rmse: 0.905002
[629]	training's rmse: 0.816641	

[675]	training's rmse: 0.813048	valid_1's rmse: 0.904285
[676]	training's rmse: 0.812962	valid_1's rmse: 0.90428
[677]	training's rmse: 0.812833	valid_1's rmse: 0.904237
[678]	training's rmse: 0.812787	valid_1's rmse: 0.904252
[679]	training's rmse: 0.812681	valid_1's rmse: 0.904185
[680]	training's rmse: 0.812595	valid_1's rmse: 0.904208
[681]	training's rmse: 0.812471	valid_1's rmse: 0.904121
[682]	training's rmse: 0.812333	valid_1's rmse: 0.904061
[683]	training's rmse: 0.812213	valid_1's rmse: 0.904028
[684]	training's rmse: 0.812168	valid_1's rmse: 0.904028
[685]	training's rmse: 0.81212	valid_1's rmse: 0.904059
[686]	training's rmse: 0.812069	valid_1's rmse: 0.904098
[687]	training's rmse: 0.812012	valid_1's rmse: 0.904111
[688]	training's rmse: 0.811957	valid_1's rmse: 0.904146
[689]	training's rmse: 0.811878	valid_1's rmse: 0.904085
[690]	training's rmse: 0.81179	valid_1's rmse: 0.904022
[691]	training's rmse: 0.811704	valid_1's rmse: 0.903977
[692]	training's rmse: 0.811642	va

[738]	training's rmse: 0.808327	valid_1's rmse: 0.903192
[739]	training's rmse: 0.80827	valid_1's rmse: 0.903219
[740]	training's rmse: 0.808221	valid_1's rmse: 0.903233
[741]	training's rmse: 0.808102	valid_1's rmse: 0.90312
[742]	training's rmse: 0.808054	valid_1's rmse: 0.903124
[743]	training's rmse: 0.807994	valid_1's rmse: 0.903122
[744]	training's rmse: 0.80794	valid_1's rmse: 0.90315
[745]	training's rmse: 0.807892	valid_1's rmse: 0.903175
[746]	training's rmse: 0.807856	valid_1's rmse: 0.903185
[747]	training's rmse: 0.807752	valid_1's rmse: 0.903056
[748]	training's rmse: 0.807624	valid_1's rmse: 0.903029
[749]	training's rmse: 0.807541	valid_1's rmse: 0.902993
[750]	training's rmse: 0.807423	valid_1's rmse: 0.902887
[751]	training's rmse: 0.807368	valid_1's rmse: 0.902889
[752]	training's rmse: 0.807283	valid_1's rmse: 0.902831
[753]	training's rmse: 0.807235	valid_1's rmse: 0.902858
[754]	training's rmse: 0.8071	valid_1's rmse: 0.902777
[755]	training's rmse: 0.806952	valid

[801]	training's rmse: 0.803727	valid_1's rmse: 0.901615
[802]	training's rmse: 0.803673	valid_1's rmse: 0.901577
[803]	training's rmse: 0.803594	valid_1's rmse: 0.901517
[804]	training's rmse: 0.803537	valid_1's rmse: 0.901537
[805]	training's rmse: 0.803476	valid_1's rmse: 0.901448
[806]	training's rmse: 0.803432	valid_1's rmse: 0.901458
[807]	training's rmse: 0.803373	valid_1's rmse: 0.901429
[808]	training's rmse: 0.803333	valid_1's rmse: 0.901444
[809]	training's rmse: 0.803283	valid_1's rmse: 0.90144
[810]	training's rmse: 0.803231	valid_1's rmse: 0.901448
[811]	training's rmse: 0.803187	valid_1's rmse: 0.901457
[812]	training's rmse: 0.803078	valid_1's rmse: 0.901352
[813]	training's rmse: 0.803014	valid_1's rmse: 0.901326
[814]	training's rmse: 0.80298	valid_1's rmse: 0.901329
[815]	training's rmse: 0.802937	valid_1's rmse: 0.901359
[816]	training's rmse: 0.802883	valid_1's rmse: 0.901372
[817]	training's rmse: 0.802814	valid_1's rmse: 0.901364
[818]	training's rmse: 0.802769	v

[864]	training's rmse: 0.800201	valid_1's rmse: 0.9008
[865]	training's rmse: 0.800092	valid_1's rmse: 0.900748
[866]	training's rmse: 0.800019	valid_1's rmse: 0.900756
[867]	training's rmse: 0.79994	valid_1's rmse: 0.90077
[868]	training's rmse: 0.799868	valid_1's rmse: 0.90077
[869]	training's rmse: 0.799803	valid_1's rmse: 0.900748
[870]	training's rmse: 0.79974	valid_1's rmse: 0.900689
[871]	training's rmse: 0.799715	valid_1's rmse: 0.900684
[872]	training's rmse: 0.799667	valid_1's rmse: 0.900696
[873]	training's rmse: 0.799611	valid_1's rmse: 0.900708
[874]	training's rmse: 0.799544	valid_1's rmse: 0.900729
[875]	training's rmse: 0.799507	valid_1's rmse: 0.900748
[876]	training's rmse: 0.799461	valid_1's rmse: 0.900732
[877]	training's rmse: 0.799423	valid_1's rmse: 0.900737
[878]	training's rmse: 0.79938	valid_1's rmse: 0.900744
[879]	training's rmse: 0.799308	valid_1's rmse: 0.90074
[880]	training's rmse: 0.799266	valid_1's rmse: 0.90074
[881]	training's rmse: 0.799206	valid_1'

[928]	training's rmse: 0.796645	valid_1's rmse: 0.90018
[929]	training's rmse: 0.796622	valid_1's rmse: 0.900185
[930]	training's rmse: 0.79659	valid_1's rmse: 0.9002
[931]	training's rmse: 0.79655	valid_1's rmse: 0.900177
[932]	training's rmse: 0.796491	valid_1's rmse: 0.900145
[933]	training's rmse: 0.796464	valid_1's rmse: 0.900134
[934]	training's rmse: 0.796433	valid_1's rmse: 0.900141
[935]	training's rmse: 0.796403	valid_1's rmse: 0.90011
[936]	training's rmse: 0.796369	valid_1's rmse: 0.900112
[937]	training's rmse: 0.79631	valid_1's rmse: 0.900116
[938]	training's rmse: 0.79625	valid_1's rmse: 0.900086
[939]	training's rmse: 0.796221	valid_1's rmse: 0.900084
[940]	training's rmse: 0.796185	valid_1's rmse: 0.900118
[941]	training's rmse: 0.79615	valid_1's rmse: 0.900104
[942]	training's rmse: 0.796094	valid_1's rmse: 0.899994
[943]	training's rmse: 0.796035	valid_1's rmse: 0.899959
[944]	training's rmse: 0.795967	valid_1's rmse: 0.899933
[945]	training's rmse: 0.79588	valid_1's

[991]	training's rmse: 0.793483	valid_1's rmse: 0.899113
[992]	training's rmse: 0.793435	valid_1's rmse: 0.899109
[993]	training's rmse: 0.79341	valid_1's rmse: 0.899105
[994]	training's rmse: 0.793325	valid_1's rmse: 0.899071
[995]	training's rmse: 0.79329	valid_1's rmse: 0.899066
[996]	training's rmse: 0.793191	valid_1's rmse: 0.899036
[997]	training's rmse: 0.793131	valid_1's rmse: 0.899029
[998]	training's rmse: 0.793083	valid_1's rmse: 0.899019
[999]	training's rmse: 0.79306	valid_1's rmse: 0.899036
[1000]	training's rmse: 0.792979	valid_1's rmse: 0.898987


In [69]:
matrix.shape

(6639294, 83)

In [70]:
fi = lgb_model.feature_importance()
sorted_index = np.argsort(-fi)
sorted_index = sorted_index[:60]
sel_cols = list(X_train.columns[sorted_index]) + ['item_cnt_month']
matrix_sel_col = matrix[sel_cols]

In [71]:
matrix_sel_col.shape

(6639294, 61)

In [72]:
matrix_sel_col.columns

Index(['delta2_cnt_month_lag', 'delta_cnt_month_lag', 'item_category_id',
       'date_block_num', 'date_item_avg_item_cnt_lag_1', 'delta_price_lag',
       'month', 'date_cat_shop_avg_item_cnt_lag_1',
       'date_type_avg_item_cnt_lag_1', 'cat_subtype_code',
       'date_shop_avg_item_cnt_lag_1', 'item_id',
       'date_cat_avg_item_cnt_lag_1', 'date_shopitem_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_2', 'cat_type_code',
       'date_avg_item_cnt_lag_1', 'date_item_shoptype_avg_item_cnt_lag_1',
       'date_item_type_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1',
       'qushi_date_shop_item_avg_cnt_month_lag_1', 'shop_id',
       'date_type_avg_item_cnt_lag_2', 'date_cat_avg_item_cnt_lag_2',
       'item_first_sale', 'date_cat_shop_avg_item_cnt_lag_2',
       'date_shopitem_avg_item_cnt_lag_2', 'item_shop_first_sale',
       'date_item_type_avg_item_cnt_lag_2', 'date_avg_item_cnt_lag_3',
       'date_city_avg_item_cnt_lag_1', 'date_avg_item_cnt_lag_2'

In [73]:
"""建模"""

trainData = matrix_sel_col[matrix_sel_col['date_block_num'] < 33]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix_sel_col[matrix_sel_col['date_block_num'] == 33]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

import lightgbm as lgb
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 8,
    'num_leaves': 200,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.005,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9112
[LightGBM] [Info] Number of data points in the train set: 6186922, number of used features: 60
[LightGBM] [Info] Start training from score 0.288852
[1]	training's rmse: 1.18582	valid_1's rmse: 1.1348
[2]	training's rmse: 1.18313	valid_1's rmse: 1.13274
[3]	training's rmse: 1.18072	valid_1's rmse: 1.13093
[4]	training's rmse: 1.17829	valid_1's rmse: 1.12915
[5]	training's rmse: 1.17575	valid_1's rmse: 1.12727
[6]	training's rmse: 1.17307	valid_1's rmse: 1.12498
[7]	training's rmse: 1.17119	valid_1's rmse: 1.12366
[8]	training's rmse: 1.16887	valid_1's rmse: 1.12203
[9]	training's rmse: 1.1663	valid_1's rmse: 1.11991
[10]	training's rmse: 1.16395	valid_1's rmse: 1.11827
[11]	training's rmse: 1.16161	valid_1's rmse: 1.11651
[12]	training's rmse: 1.1593	valid_1's rmse: 1.11483
[13]	training's rmse: 1.15705	valid_1's rmse: 1.11316
[14]	t

[141]	training's rmse: 0.959586	valid_1's rmse: 0.974309
[142]	training's rmse: 0.958568	valid_1's rmse: 0.973633
[143]	training's rmse: 0.95754	valid_1's rmse: 0.972863
[144]	training's rmse: 0.956616	valid_1's rmse: 0.972329
[145]	training's rmse: 0.95571	valid_1's rmse: 0.971867
[146]	training's rmse: 0.954717	valid_1's rmse: 0.971226
[147]	training's rmse: 0.95364	valid_1's rmse: 0.97049
[148]	training's rmse: 0.952583	valid_1's rmse: 0.96974
[149]	training's rmse: 0.951725	valid_1's rmse: 0.969236
[150]	training's rmse: 0.950685	valid_1's rmse: 0.968559
[151]	training's rmse: 0.949906	valid_1's rmse: 0.968066
[152]	training's rmse: 0.9491	valid_1's rmse: 0.967605
[153]	training's rmse: 0.948299	valid_1's rmse: 0.967192
[154]	training's rmse: 0.947284	valid_1's rmse: 0.966477
[155]	training's rmse: 0.946486	valid_1's rmse: 0.96601
[156]	training's rmse: 0.945686	valid_1's rmse: 0.965577
[157]	training's rmse: 0.944917	valid_1's rmse: 0.965162
[158]	training's rmse: 0.944136	valid_1

[286]	training's rmse: 0.869299	valid_1's rmse: 0.923873
[287]	training's rmse: 0.868806	valid_1's rmse: 0.923565
[288]	training's rmse: 0.868432	valid_1's rmse: 0.923432
[289]	training's rmse: 0.868154	valid_1's rmse: 0.923335
[290]	training's rmse: 0.867835	valid_1's rmse: 0.923227
[291]	training's rmse: 0.867513	valid_1's rmse: 0.923116
[292]	training's rmse: 0.867166	valid_1's rmse: 0.923019
[293]	training's rmse: 0.866843	valid_1's rmse: 0.922894
[294]	training's rmse: 0.866557	valid_1's rmse: 0.922775
[295]	training's rmse: 0.866188	valid_1's rmse: 0.922651
[296]	training's rmse: 0.865888	valid_1's rmse: 0.922561
[297]	training's rmse: 0.865424	valid_1's rmse: 0.922334
[298]	training's rmse: 0.864858	valid_1's rmse: 0.921923
[299]	training's rmse: 0.864425	valid_1's rmse: 0.921715
[300]	training's rmse: 0.864002	valid_1's rmse: 0.921441
[301]	training's rmse: 0.863597	valid_1's rmse: 0.921159
[302]	training's rmse: 0.863204	valid_1's rmse: 0.920956
[303]	training's rmse: 0.862892

[431]	training's rmse: 0.829826	valid_1's rmse: 0.908426
[432]	training's rmse: 0.829645	valid_1's rmse: 0.908419
[433]	training's rmse: 0.829406	valid_1's rmse: 0.908341
[434]	training's rmse: 0.829241	valid_1's rmse: 0.908338
[435]	training's rmse: 0.829108	valid_1's rmse: 0.908337
[436]	training's rmse: 0.828953	valid_1's rmse: 0.90834
[437]	training's rmse: 0.828825	valid_1's rmse: 0.908335
[438]	training's rmse: 0.828662	valid_1's rmse: 0.908319
[439]	training's rmse: 0.828424	valid_1's rmse: 0.908235
[440]	training's rmse: 0.828119	valid_1's rmse: 0.908143
[441]	training's rmse: 0.827971	valid_1's rmse: 0.908121
[442]	training's rmse: 0.827771	valid_1's rmse: 0.908006
[443]	training's rmse: 0.827645	valid_1's rmse: 0.907989
[444]	training's rmse: 0.827467	valid_1's rmse: 0.907948
[445]	training's rmse: 0.82715	valid_1's rmse: 0.907827
[446]	training's rmse: 0.826943	valid_1's rmse: 0.907781
[447]	training's rmse: 0.826713	valid_1's rmse: 0.907713
[448]	training's rmse: 0.826491	v

[567]	training's rmse: 0.809076	valid_1's rmse: 0.902798
[568]	training's rmse: 0.808954	valid_1's rmse: 0.9028
[569]	training's rmse: 0.80881	valid_1's rmse: 0.90279
[570]	training's rmse: 0.80874	valid_1's rmse: 0.902791
[571]	training's rmse: 0.808566	valid_1's rmse: 0.90273
[572]	training's rmse: 0.808416	valid_1's rmse: 0.902688
[573]	training's rmse: 0.808256	valid_1's rmse: 0.90265
[574]	training's rmse: 0.808168	valid_1's rmse: 0.902623
[575]	training's rmse: 0.807999	valid_1's rmse: 0.902564
[576]	training's rmse: 0.807919	valid_1's rmse: 0.902612
[577]	training's rmse: 0.807751	valid_1's rmse: 0.902573
[578]	training's rmse: 0.807685	valid_1's rmse: 0.902597
[579]	training's rmse: 0.807606	valid_1's rmse: 0.902595
[580]	training's rmse: 0.807533	valid_1's rmse: 0.902608
[581]	training's rmse: 0.807391	valid_1's rmse: 0.902569
[582]	training's rmse: 0.80726	valid_1's rmse: 0.902555
[583]	training's rmse: 0.807164	valid_1's rmse: 0.902554
[584]	training's rmse: 0.806982	valid_1

[696]	training's rmse: 0.796347	valid_1's rmse: 0.900257
[697]	training's rmse: 0.796277	valid_1's rmse: 0.900231
[698]	training's rmse: 0.79615	valid_1's rmse: 0.900112
[699]	training's rmse: 0.796043	valid_1's rmse: 0.900115
[700]	training's rmse: 0.795974	valid_1's rmse: 0.900071
[701]	training's rmse: 0.795855	valid_1's rmse: 0.900099
[702]	training's rmse: 0.795775	valid_1's rmse: 0.900147
[703]	training's rmse: 0.795719	valid_1's rmse: 0.900143
[704]	training's rmse: 0.795619	valid_1's rmse: 0.90011
[705]	training's rmse: 0.795553	valid_1's rmse: 0.900099
[706]	training's rmse: 0.795473	valid_1's rmse: 0.900052
[707]	training's rmse: 0.79532	valid_1's rmse: 0.899984
[708]	training's rmse: 0.795132	valid_1's rmse: 0.899929
[709]	training's rmse: 0.795054	valid_1's rmse: 0.899929
[710]	training's rmse: 0.794869	valid_1's rmse: 0.899839
[711]	training's rmse: 0.794782	valid_1's rmse: 0.899801
[712]	training's rmse: 0.794663	valid_1's rmse: 0.899706
[713]	training's rmse: 0.794543	va

[824]	training's rmse: 0.786017	valid_1's rmse: 0.897767
[825]	training's rmse: 0.785948	valid_1's rmse: 0.89774
[826]	training's rmse: 0.785879	valid_1's rmse: 0.897737
[827]	training's rmse: 0.785801	valid_1's rmse: 0.897736
[828]	training's rmse: 0.785747	valid_1's rmse: 0.897737
[829]	training's rmse: 0.785673	valid_1's rmse: 0.897743
[830]	training's rmse: 0.785553	valid_1's rmse: 0.897704
[831]	training's rmse: 0.785499	valid_1's rmse: 0.897695
[832]	training's rmse: 0.785446	valid_1's rmse: 0.897712
[833]	training's rmse: 0.785409	valid_1's rmse: 0.897712
[834]	training's rmse: 0.785317	valid_1's rmse: 0.89762
[835]	training's rmse: 0.785228	valid_1's rmse: 0.897568
[836]	training's rmse: 0.785177	valid_1's rmse: 0.897573
[837]	training's rmse: 0.785126	valid_1's rmse: 0.897577
[838]	training's rmse: 0.785085	valid_1's rmse: 0.897605
[839]	training's rmse: 0.785028	valid_1's rmse: 0.897609
[840]	training's rmse: 0.784967	valid_1's rmse: 0.897569
[841]	training's rmse: 0.784899	v

[931]	training's rmse: 0.779784	valid_1's rmse: 0.896547
[932]	training's rmse: 0.779754	valid_1's rmse: 0.89657
[933]	training's rmse: 0.779696	valid_1's rmse: 0.896527
[934]	training's rmse: 0.779636	valid_1's rmse: 0.896533
[935]	training's rmse: 0.779558	valid_1's rmse: 0.896449
[936]	training's rmse: 0.77951	valid_1's rmse: 0.896448
[937]	training's rmse: 0.779473	valid_1's rmse: 0.896449
[938]	training's rmse: 0.779422	valid_1's rmse: 0.896451
[939]	training's rmse: 0.779359	valid_1's rmse: 0.896351
[940]	training's rmse: 0.779319	valid_1's rmse: 0.896344
[941]	training's rmse: 0.779272	valid_1's rmse: 0.896351
[942]	training's rmse: 0.779229	valid_1's rmse: 0.896367
[943]	training's rmse: 0.779176	valid_1's rmse: 0.896364
[944]	training's rmse: 0.779092	valid_1's rmse: 0.896306
[945]	training's rmse: 0.779035	valid_1's rmse: 0.896312
[946]	training's rmse: 0.778978	valid_1's rmse: 0.896298
[947]	training's rmse: 0.778944	valid_1's rmse: 0.896287
[948]	training's rmse: 0.778897	v

# 训练33个月

In [74]:
trainData = matrix_sel_col[matrix_sel_col['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

train_data = lgb.Dataset(data=X_train, label=label_train)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 8,
    'num_leaves': 200,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.005,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9138
[LightGBM] [Info] Number of data points in the train set: 6425094, number of used features: 60
[LightGBM] [Info] Start training from score 0.287729
[1]	training's rmse: 1.18398
[2]	training's rmse: 1.18132
[3]	training's rmse: 1.17895
[4]	training's rmse: 1.17654
[5]	training's rmse: 1.17404
[6]	training's rmse: 1.17141
[7]	training's rmse: 1.16956
[8]	training's rmse: 1.16727
[9]	training's rmse: 1.16474
[10]	training's rmse: 1.16242
[11]	training's rmse: 1.16011
[12]	training's rmse: 1.15783
[13]	training's rmse: 1.15561
[14]	training's rmse: 1.15321
[15]	training's rmse: 1.15091
[16]	training's rmse: 1.14869
[17]	training's rmse: 1.1467
[18]	training's rmse: 1.14457
[19]	training's rmse: 1.14238
[20]	training's rmse: 1.13996
[21]	training's rmse: 1.13817
[22]	training's rmse: 1.13605
[23]	training's rmse: 1.13455
[24]	training's 

[248]	training's rmse: 0.88934
[249]	training's rmse: 0.888791
[250]	training's rmse: 0.888202
[251]	training's rmse: 0.88768
[252]	training's rmse: 0.887071
[253]	training's rmse: 0.886652
[254]	training's rmse: 0.88625
[255]	training's rmse: 0.88585
[256]	training's rmse: 0.885242
[257]	training's rmse: 0.884875
[258]	training's rmse: 0.884296
[259]	training's rmse: 0.883771
[260]	training's rmse: 0.883169
[261]	training's rmse: 0.882793
[262]	training's rmse: 0.882399
[263]	training's rmse: 0.881889
[264]	training's rmse: 0.881507
[265]	training's rmse: 0.880946
[266]	training's rmse: 0.88055
[267]	training's rmse: 0.880124
[268]	training's rmse: 0.879621
[269]	training's rmse: 0.879091
[270]	training's rmse: 0.878404
[271]	training's rmse: 0.877892
[272]	training's rmse: 0.877565
[273]	training's rmse: 0.877173
[274]	training's rmse: 0.876801
[275]	training's rmse: 0.876317
[276]	training's rmse: 0.875966
[277]	training's rmse: 0.875599
[278]	training's rmse: 0.8753
[279]	training'

[499]	training's rmse: 0.821803
[500]	training's rmse: 0.821669
[501]	training's rmse: 0.82156
[502]	training's rmse: 0.821452
[503]	training's rmse: 0.821351
[504]	training's rmse: 0.821115
[505]	training's rmse: 0.821018
[506]	training's rmse: 0.82091
[507]	training's rmse: 0.820834
[508]	training's rmse: 0.820721
[509]	training's rmse: 0.820634
[510]	training's rmse: 0.820516
[511]	training's rmse: 0.820447
[512]	training's rmse: 0.820341
[513]	training's rmse: 0.820124
[514]	training's rmse: 0.81999
[515]	training's rmse: 0.819913
[516]	training's rmse: 0.819687
[517]	training's rmse: 0.819528
[518]	training's rmse: 0.81945
[519]	training's rmse: 0.819321
[520]	training's rmse: 0.819128
[521]	training's rmse: 0.818968
[522]	training's rmse: 0.818886
[523]	training's rmse: 0.818726
[524]	training's rmse: 0.818597
[525]	training's rmse: 0.818459
[526]	training's rmse: 0.818303
[527]	training's rmse: 0.818155
[528]	training's rmse: 0.818003
[529]	training's rmse: 0.817825
[530]	traini

[721]	training's rmse: 0.797683
[722]	training's rmse: 0.797642
[723]	training's rmse: 0.797566
[724]	training's rmse: 0.797481
[725]	training's rmse: 0.79736
[726]	training's rmse: 0.797258
[727]	training's rmse: 0.797213
[728]	training's rmse: 0.797126
[729]	training's rmse: 0.796974
[730]	training's rmse: 0.796917
[731]	training's rmse: 0.796844
[732]	training's rmse: 0.796775
[733]	training's rmse: 0.796686
[734]	training's rmse: 0.796595
[735]	training's rmse: 0.796466
[736]	training's rmse: 0.7964
[737]	training's rmse: 0.796337
[738]	training's rmse: 0.796292
[739]	training's rmse: 0.796259
[740]	training's rmse: 0.796204
[741]	training's rmse: 0.796142
[742]	training's rmse: 0.796084
[743]	training's rmse: 0.796019
[744]	training's rmse: 0.795979
[745]	training's rmse: 0.795897
[746]	training's rmse: 0.795794
[747]	training's rmse: 0.795722
[748]	training's rmse: 0.795652
[749]	training's rmse: 0.79551
[750]	training's rmse: 0.795426
[751]	training's rmse: 0.795361
[752]	traini

[901]	training's rmse: 0.785449
[902]	training's rmse: 0.785387
[903]	training's rmse: 0.785328
[904]	training's rmse: 0.78529
[905]	training's rmse: 0.785229
[906]	training's rmse: 0.785201
[907]	training's rmse: 0.785175
[908]	training's rmse: 0.78508
[909]	training's rmse: 0.784976
[910]	training's rmse: 0.784941
[911]	training's rmse: 0.784832
[912]	training's rmse: 0.784773
[913]	training's rmse: 0.784703
[914]	training's rmse: 0.784653
[915]	training's rmse: 0.784624
[916]	training's rmse: 0.784593
[917]	training's rmse: 0.784554
[918]	training's rmse: 0.784526
[919]	training's rmse: 0.784496
[920]	training's rmse: 0.784433
[921]	training's rmse: 0.784386
[922]	training's rmse: 0.784312
[923]	training's rmse: 0.784255
[924]	training's rmse: 0.784208
[925]	training's rmse: 0.784118
[926]	training's rmse: 0.784073
[927]	training's rmse: 0.783971
[928]	training's rmse: 0.783928
[929]	training's rmse: 0.783863
[930]	training's rmse: 0.783826
[931]	training's rmse: 0.783784
[932]	trai

In [76]:
# test数据
testData = matrix_sel_col[matrix_sel_col['date_block_num'] == 34]
X_test = testData.drop('item_cnt_month', axis=1)
# 预测&生成文件
y_test = lgb_model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({ 'ID': range(0, 214200), 'item_cnt_month': y_test})

test0 = test[test.item_id.isin(six_zero_item_id)]
ids = list(test0.ID.values)
submission.loc[submission.ID.isin(ids), 'item_cnt_month'] = 0.0
submission.to_csv('./submit/sub3.csv', index=False)