In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
# six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
# six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    ic(name)
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

matrix = lag_features(matrix, [1,2,3,6,12], 'item_cnt_month')

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)

# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)]=(matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0

matrix['delta_price_lag']=matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

# 每个月的天数
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)
matrix['days'] = matrix['days'].astype(np.int8)

# 开始销量
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

ic| name: '!Якутск Орджоникидзе, 56 фран'
ic| name: '!Якутск ТЦ "Центральный" фран'
ic| name: 'Адыгея ТЦ "Мега"'
ic| name: 'Балашиха ТРК "Октябрь-Киномир"'
ic| name: 'Волжский ТЦ "Волга Молл"'
ic| name: 'Вологда ТРЦ "Мармелад"'
ic| name: 'Воронеж (Плехановская, 13)'
ic| name: 'Воронеж ТРЦ "Максимир"'
ic| name: 'Воронеж ТРЦ Сити-Парк "Град"'
ic| name: 'Выездная Торговля'
ic| name: 'Жуковский ул. Чкалова 39м?'
ic| name: 'Жуковский ул. Чкалова 39м²'
ic| name: 'Интернет-магазин ЧС'
ic| name: 'Казань ТЦ "Бехетле"'
ic| name: 'Казань ТЦ "ПаркХаус" II'
ic| name: 'Калуга ТРЦ "XXI век"'
ic| name: 'Коломна ТЦ "Рио"'
ic| name: 'Красноярск ТЦ "Взлетка Плаза"'
ic| name: 'Красноярск ТЦ "Июнь"'
ic| name: 'Курск ТЦ "Пушкинский"'
ic| name: 'Москва "Распродажа"'
ic| name: 'Москва МТРЦ "Афи Молл"'
ic| name: 'Москва Магазин С21'
ic| name: 'Москва ТК "Буденовский" (пав.А2)'
ic| name: 'Москва ТК "Буденовский" (пав.К7)'
ic| name: 'Москва ТРК "Атриум"'
ic| name: 'Москва ТЦ "Ареал" (Беляево)'
ic| name: 'Москва 

In [2]:
matrix.shape

(11128050, 64)

In [3]:
# matrix_11128050_64_bak = matrix.copy()

In [4]:
# 月销量（商店类型）
group = matrix.groupby(['date_block_num','shop_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shoptype_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shoptype_avg_item_cnt')
matrix.drop('date_shoptype_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店类型）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_shoptype_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_shoptype_avg_item_cnt')
matrix.drop('date_item_shoptype_avg_item_cnt', axis=1, inplace=True)

# # 月销量（商店-商品）
# group = matrix.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_month': ['mean']})
# group.columns = [ 'date_shopitem_avg_item_cnt' ]
# group.reset_index(inplace=True)
# matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
# matrix = lag_features(matrix, [1,2,3,6,12], 'date_shopitem_avg_item_cnt')
# matrix.drop('date_shopitem_avg_item_cnt', axis=1, inplace=True)

In [5]:
group = matrix.groupby('item_id').agg({'item_cnt_month': 'mean'})
group.columns = ['trend_item_avg_cnt_month']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

In [6]:
group = matrix.groupby(['date_block_num','item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['trend_date_item_avg_cnt_month']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

In [7]:
# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'trend_date_item_avg_cnt_month')
for i in lags:
    matrix['delta_cnt_month_lag_'+str(i)] = (matrix['trend_date_item_avg_cnt_month_lag_' + str(i)] - matrix['trend_item_avg_cnt_month']) / matrix['trend_item_avg_cnt_month']

In [8]:
def select_trend2(row):
    for i in lags:
        if pd.notnull(row['delta_cnt_month_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_cnt_month_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

matrix['delta_cnt_month_lag'] = matrix.apply(select_trend2, axis=1)
matrix['delta_cnt_month_lag'] = matrix['delta_cnt_month_lag'].astype(np.float16)

In [11]:
matrix.shape

(11128050, 91)

In [12]:
features_to_drop = ['trend_item_avg_cnt_month','trend_date_item_avg_cnt_month']
for i in lags:
    features_to_drop += ['trend_date_item_avg_cnt_month_lag_' + str(i)]
    features_to_drop += ['delta_cnt_month_lag_' + str(i)]

matrix.drop(features_to_drop, axis=1, inplace=True)

In [13]:
matrix.shape

(11128050, 75)

In [14]:
# **********趋势特征 delta2_cnt_month_lag ********************
group = matrix.groupby(['shop_id', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['qushi_shop_item_avg_cnt_month']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['shop_id', 'item_id'], how='left')

group = matrix.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['qushi_date_shop_item_avg_cnt_month']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'shop_id', 'item_id'], how='left')

# 计算matrix中商品的历史价格
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'qushi_date_shop_item_avg_cnt_month')
for i in lags:
    matrix['delta2_cnt_month_lag_'+str(i)] = (matrix['qushi_date_shop_item_avg_cnt_month_lag_' + str(i)] - matrix['qushi_shop_item_avg_cnt_month']) / matrix['qushi_shop_item_avg_cnt_month']

def select_trend3(row):
    for i in lags:
        if pd.notnull(row['delta2_cnt_month_lag_'+str(i)]):  # 如果不是NaN
            return row['delta2_cnt_month_lag_'+str(i)]
    return 0   #  如果delta_price_lag_都为空，那么将趋势设为0，0代表没有趋势

matrix['delta2_cnt_month_lag'] = matrix.apply(select_trend3, axis=1)
matrix['delta2_cnt_month_lag'] = matrix['delta2_cnt_month_lag'].astype(np.float16)

features_to_drop = ['qushi_shop_item_avg_cnt_month','qushi_date_shop_item_avg_cnt_month']
for i in lags:
    features_to_drop += ['delta2_cnt_month_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

In [15]:
matrix.shape

(11128050, 83)

In [16]:
# matrix_11128050_83_bak = matrix.copy()

In [17]:
# 因为有12个月的延迟特征（1，2，3，6，12）（1，2，3，4，5，6，12），所以需要删除前12月的数据
matrix = matrix[matrix['date_block_num'] > 11]

# 找到有NaN值的列，然后把那些列中的NaN值填充0
columns = matrix.columns
column_null = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null.append(i)

for i in column_null:
    matrix[i].fillna(0, inplace=True)

In [18]:
matrix.shape

(6639294, 83)

In [29]:
matrix.shape

(6639294, 83)

In [28]:
dict(matrix.count())

{'date_block_num': 6639294,
 'shop_id': 6639294,
 'item_id': 6639294,
 'item_cnt_month': 6639294,
 'item_category_id': 6639294,
 'cat_type_code': 6639294,
 'cat_subtype_code': 6639294,
 'shop_city_code': 6639294,
 'shop_type_code': 6639294,
 'item_cnt_month_lag_1': 6639294,
 'item_cnt_month_lag_2': 6639294,
 'item_cnt_month_lag_3': 6639294,
 'item_cnt_month_lag_6': 6639294,
 'item_cnt_month_lag_12': 6639294,
 'date_avg_item_cnt_lag_1': 6639294,
 'date_avg_item_cnt_lag_2': 6639294,
 'date_avg_item_cnt_lag_3': 6639294,
 'date_avg_item_cnt_lag_6': 6639294,
 'date_avg_item_cnt_lag_12': 6639294,
 'date_item_avg_item_cnt_lag_1': 6639294,
 'date_item_avg_item_cnt_lag_2': 6639294,
 'date_item_avg_item_cnt_lag_3': 6639294,
 'date_item_avg_item_cnt_lag_6': 6639294,
 'date_item_avg_item_cnt_lag_12': 6639294,
 'date_shop_avg_item_cnt_lag_1': 6639294,
 'date_shop_avg_item_cnt_lag_2': 6639294,
 'date_shop_avg_item_cnt_lag_3': 6639294,
 'date_shop_avg_item_cnt_lag_6': 6639294,
 'date_shop_avg_item_cn

In [64]:
matrix.shape

(6639294, 83)

# 分析

In [67]:
m33 = matrix[matrix.date_block_num == 33]
m33

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_shoptype_avg_item_cnt_lag_12,delta_cnt_month_lag,qushi_date_shop_item_avg_cnt_month_lag_1,qushi_date_shop_item_avg_cnt_month_lag_2,qushi_date_shop_item_avg_cnt_month_lag_3,qushi_date_shop_item_avg_cnt_month_lag_4,qushi_date_shop_item_avg_cnt_month_lag_5,qushi_date_shop_item_avg_cnt_month_lag_6,qushi_date_shop_item_avg_cnt_month_lag_12,delta2_cnt_month_lag
10675678,33,2,30,0.0,40,11,4,1,4,0.0,...,0.205933,-0.934082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.000000
10675679,33,2,31,1.0,37,11,1,1,4,0.0,...,0.147095,-0.749512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.000000
10675680,33,2,32,0.0,40,11,4,1,4,0.0,...,0.823730,-0.645996,0.0,1.0,0.0,0.0,0.0,0.0,2.0,-1.000000
10675681,33,2,33,0.0,37,11,1,1,4,1.0,...,0.294189,-0.275635,1.0,0.0,1.0,0.0,1.0,1.0,0.0,2.181641
10675682,33,2,40,0.0,57,13,8,1,4,0.0,...,0.000000,-0.700195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10913845,33,59,22162,0.0,40,11,4,31,4,0.0,...,0.000000,-0.877930,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.000000
10913846,33,59,22163,0.0,40,11,4,31,4,0.0,...,0.000000,0.060699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
10913847,33,59,22164,0.0,37,11,1,31,4,0.0,...,0.000000,-0.806641,0.0,0.0,1.0,0.0,0.0,2.0,0.0,-1.000000
10913848,33,59,22166,0.0,54,12,61,31,4,0.0,...,0.000000,-0.556152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [72]:
m33.item_cnt_month.value_counts()

0.0     206701
1.0      21351
2.0       5070
3.0       1907
4.0        959
5.0        582
6.0        341
20.0       261
7.0        226
8.0        158
9.0        138
10.0       103
11.0        65
12.0        63
13.0        54
15.0        50
14.0        46
16.0        34
18.0        25
17.0        21
19.0        17
Name: item_cnt_month, dtype: int64

In [71]:
list(m33.item_cnt_month.values)[:1000]

[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

# 训练

In [65]:
"""建模"""
trainData = matrix[matrix['date_block_num'] < 33]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

validData = matrix[matrix['date_block_num'] == 33]
label_valid = validData['item_cnt_month']
X_valid = validData.drop('item_cnt_month', axis=1)

In [66]:
import lightgbm as lgb
train_data = lgb.Dataset(data=X_train, label=label_train)
valid_data = lgb.Dataset(data=X_valid, label=label_valid)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 8,
    'num_leaves': 220,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13242
[LightGBM] [Info] Number of data points in the train set: 6186922, number of used features: 82
[LightGBM] [Info] Start training from score 0.288852
[1]	training's rmse: 1.1832	valid_1's rmse: 1.13277
[2]	training's rmse: 1.17942	valid_1's rmse: 1.13028
[3]	training's rmse: 1.17439	valid_1's rmse: 1.12664
[4]	training's rmse: 1.1691	valid_1's rmse: 1.12216
[5]	training's rmse: 1.16517	valid_1's rmse: 1.11905
[6]	training's rmse: 1.16042	valid_1's rmse: 1.11572
[7]	training's rmse: 1.15553	valid_1's rmse: 1.11206
[8]	training's rmse: 1.15126	valid_1's rmse: 1.10919
[9]	training's rmse: 1.14688	valid_1's rmse: 1.10622
[10]	training's rmse: 1.14263	valid_1's rmse: 1.10273
[11]	training's rmse: 1.13761	valid_1's rmse: 1.0986
[12]	training's rmse: 1.13334	valid_1's rmse: 1.09561
[13]	training's rmse: 1.12949	valid_1's rmse: 1.09298
[14]	

[136]	training's rmse: 0.875868	valid_1's rmse: 0.92694
[137]	training's rmse: 0.875039	valid_1's rmse: 0.926595
[138]	training's rmse: 0.874343	valid_1's rmse: 0.926246
[139]	training's rmse: 0.873568	valid_1's rmse: 0.925954
[140]	training's rmse: 0.872819	valid_1's rmse: 0.925686
[141]	training's rmse: 0.872327	valid_1's rmse: 0.925387
[142]	training's rmse: 0.871587	valid_1's rmse: 0.925143
[143]	training's rmse: 0.870636	valid_1's rmse: 0.924643
[144]	training's rmse: 0.86998	valid_1's rmse: 0.924367
[145]	training's rmse: 0.86942	valid_1's rmse: 0.924029
[146]	training's rmse: 0.868798	valid_1's rmse: 0.923663
[147]	training's rmse: 0.868084	valid_1's rmse: 0.923473
[148]	training's rmse: 0.867348	valid_1's rmse: 0.923055
[149]	training's rmse: 0.866518	valid_1's rmse: 0.922661
[150]	training's rmse: 0.865968	valid_1's rmse: 0.922455
[151]	training's rmse: 0.864967	valid_1's rmse: 0.921886
[152]	training's rmse: 0.864299	valid_1's rmse: 0.921603
[153]	training's rmse: 0.863268	va

[260]	training's rmse: 0.8141	valid_1's rmse: 0.902126
[261]	training's rmse: 0.81374	valid_1's rmse: 0.902064
[262]	training's rmse: 0.813375	valid_1's rmse: 0.901904
[263]	training's rmse: 0.813205	valid_1's rmse: 0.901897
[264]	training's rmse: 0.813022	valid_1's rmse: 0.901917
[265]	training's rmse: 0.812787	valid_1's rmse: 0.901902
[266]	training's rmse: 0.812477	valid_1's rmse: 0.901893
[267]	training's rmse: 0.812332	valid_1's rmse: 0.901894
[268]	training's rmse: 0.811924	valid_1's rmse: 0.901742
[269]	training's rmse: 0.811523	valid_1's rmse: 0.90154
[270]	training's rmse: 0.811191	valid_1's rmse: 0.901475
[271]	training's rmse: 0.810837	valid_1's rmse: 0.901251
[272]	training's rmse: 0.810655	valid_1's rmse: 0.901281
[273]	training's rmse: 0.810281	valid_1's rmse: 0.901111
[274]	training's rmse: 0.809958	valid_1's rmse: 0.901062
[275]	training's rmse: 0.809611	valid_1's rmse: 0.901017
[276]	training's rmse: 0.809373	valid_1's rmse: 0.900966
[277]	training's rmse: 0.809054	val

[356]	training's rmse: 0.793454	valid_1's rmse: 0.898381
[357]	training's rmse: 0.793077	valid_1's rmse: 0.8982
[358]	training's rmse: 0.792816	valid_1's rmse: 0.89812
[359]	training's rmse: 0.792645	valid_1's rmse: 0.898085
[360]	training's rmse: 0.792356	valid_1's rmse: 0.89799
[361]	training's rmse: 0.792148	valid_1's rmse: 0.897745
[362]	training's rmse: 0.792048	valid_1's rmse: 0.897719
[363]	training's rmse: 0.791875	valid_1's rmse: 0.897694
[364]	training's rmse: 0.79178	valid_1's rmse: 0.897717
[365]	training's rmse: 0.791586	valid_1's rmse: 0.897569
[366]	training's rmse: 0.791469	valid_1's rmse: 0.897627
[367]	training's rmse: 0.791336	valid_1's rmse: 0.897554
[368]	training's rmse: 0.791173	valid_1's rmse: 0.897528
[369]	training's rmse: 0.790922	valid_1's rmse: 0.897401
[370]	training's rmse: 0.790683	valid_1's rmse: 0.897351
[371]	training's rmse: 0.790574	valid_1's rmse: 0.897394
[372]	training's rmse: 0.790379	valid_1's rmse: 0.897255
[373]	training's rmse: 0.790211	vali

[439]	training's rmse: 0.780781	valid_1's rmse: 0.895395
[440]	training's rmse: 0.780644	valid_1's rmse: 0.895492
[441]	training's rmse: 0.780579	valid_1's rmse: 0.895502
[442]	training's rmse: 0.780482	valid_1's rmse: 0.89553
[443]	training's rmse: 0.780385	valid_1's rmse: 0.895523
[444]	training's rmse: 0.780216	valid_1's rmse: 0.895557
[445]	training's rmse: 0.780092	valid_1's rmse: 0.895533
[446]	training's rmse: 0.779983	valid_1's rmse: 0.895512
[447]	training's rmse: 0.779844	valid_1's rmse: 0.895316
[448]	training's rmse: 0.779705	valid_1's rmse: 0.89533
[449]	training's rmse: 0.779557	valid_1's rmse: 0.89521
[450]	training's rmse: 0.779464	valid_1's rmse: 0.895188
[451]	training's rmse: 0.779302	valid_1's rmse: 0.895091
[452]	training's rmse: 0.779191	valid_1's rmse: 0.895081
[453]	training's rmse: 0.779124	valid_1's rmse: 0.895069
[454]	training's rmse: 0.778741	valid_1's rmse: 0.89469
[455]	training's rmse: 0.778525	valid_1's rmse: 0.89463
[456]	training's rmse: 0.778452	vali

[517]	training's rmse: 0.77192	valid_1's rmse: 0.893011
[518]	training's rmse: 0.771745	valid_1's rmse: 0.892935
[519]	training's rmse: 0.771693	valid_1's rmse: 0.89296
[520]	training's rmse: 0.771637	valid_1's rmse: 0.89297
[521]	training's rmse: 0.771559	valid_1's rmse: 0.89295
[522]	training's rmse: 0.771439	valid_1's rmse: 0.892871
[523]	training's rmse: 0.771346	valid_1's rmse: 0.892893
[524]	training's rmse: 0.771183	valid_1's rmse: 0.892694
[525]	training's rmse: 0.771093	valid_1's rmse: 0.892652
[526]	training's rmse: 0.771022	valid_1's rmse: 0.892636
[527]	training's rmse: 0.770963	valid_1's rmse: 0.892643
[528]	training's rmse: 0.770886	valid_1's rmse: 0.892688
[529]	training's rmse: 0.770525	valid_1's rmse: 0.892613
[530]	training's rmse: 0.77045	valid_1's rmse: 0.892602
[531]	training's rmse: 0.770371	valid_1's rmse: 0.892494
[532]	training's rmse: 0.770304	valid_1's rmse: 0.892467
[533]	training's rmse: 0.770212	valid_1's rmse: 0.892421
[534]	training's rmse: 0.770138	vali

[592]	training's rmse: 0.764777	valid_1's rmse: 0.891771
[593]	training's rmse: 0.764714	valid_1's rmse: 0.891798
[594]	training's rmse: 0.764472	valid_1's rmse: 0.89191
[595]	training's rmse: 0.764427	valid_1's rmse: 0.891948
[596]	training's rmse: 0.764394	valid_1's rmse: 0.891972
[597]	training's rmse: 0.764349	valid_1's rmse: 0.891953
[598]	training's rmse: 0.764257	valid_1's rmse: 0.891969
[599]	training's rmse: 0.764211	valid_1's rmse: 0.892005
[600]	training's rmse: 0.764166	valid_1's rmse: 0.891954
[601]	training's rmse: 0.764052	valid_1's rmse: 0.891934
[602]	training's rmse: 0.763956	valid_1's rmse: 0.891921
[603]	training's rmse: 0.763795	valid_1's rmse: 0.891807
[604]	training's rmse: 0.763585	valid_1's rmse: 0.891775
[605]	training's rmse: 0.763499	valid_1's rmse: 0.89188
[606]	training's rmse: 0.763457	valid_1's rmse: 0.891874
[607]	training's rmse: 0.763382	valid_1's rmse: 0.891843
[608]	training's rmse: 0.763327	valid_1's rmse: 0.891851
[609]	training's rmse: 0.763282	v

[669]	training's rmse: 0.758602	valid_1's rmse: 0.890146
[670]	training's rmse: 0.758508	valid_1's rmse: 0.890177
[671]	training's rmse: 0.758434	valid_1's rmse: 0.890168
[672]	training's rmse: 0.758394	valid_1's rmse: 0.890176
[673]	training's rmse: 0.758359	valid_1's rmse: 0.890183
[674]	training's rmse: 0.758303	valid_1's rmse: 0.890084
[675]	training's rmse: 0.758233	valid_1's rmse: 0.889901
[676]	training's rmse: 0.75818	valid_1's rmse: 0.889864
[677]	training's rmse: 0.758098	valid_1's rmse: 0.889865
[678]	training's rmse: 0.758039	valid_1's rmse: 0.889837
[679]	training's rmse: 0.757907	valid_1's rmse: 0.889737
[680]	training's rmse: 0.757868	valid_1's rmse: 0.88975
[681]	training's rmse: 0.757718	valid_1's rmse: 0.889679
[682]	training's rmse: 0.75765	valid_1's rmse: 0.889664
[683]	training's rmse: 0.757609	valid_1's rmse: 0.88967
[684]	training's rmse: 0.757543	valid_1's rmse: 0.889669
[685]	training's rmse: 0.757463	valid_1's rmse: 0.889656
[686]	training's rmse: 0.757426	val

[745]	training's rmse: 0.753838	valid_1's rmse: 0.887951
[746]	training's rmse: 0.753805	valid_1's rmse: 0.887954
[747]	training's rmse: 0.753657	valid_1's rmse: 0.887803
[748]	training's rmse: 0.753515	valid_1's rmse: 0.887803
[749]	training's rmse: 0.753473	valid_1's rmse: 0.887777
[750]	training's rmse: 0.753357	valid_1's rmse: 0.887684
[751]	training's rmse: 0.75298	valid_1's rmse: 0.887718
[752]	training's rmse: 0.75292	valid_1's rmse: 0.887678
[753]	training's rmse: 0.752871	valid_1's rmse: 0.887654
[754]	training's rmse: 0.752808	valid_1's rmse: 0.88763
[755]	training's rmse: 0.752774	valid_1's rmse: 0.887634
[756]	training's rmse: 0.752705	valid_1's rmse: 0.887613
[757]	training's rmse: 0.752674	valid_1's rmse: 0.8876
[758]	training's rmse: 0.752629	valid_1's rmse: 0.887614
[759]	training's rmse: 0.752596	valid_1's rmse: 0.887615
[760]	training's rmse: 0.752553	valid_1's rmse: 0.887619
[761]	training's rmse: 0.752506	valid_1's rmse: 0.887603
[762]	training's rmse: 0.752438	vali

[819]	training's rmse: 0.748667	valid_1's rmse: 0.886703
[820]	training's rmse: 0.748608	valid_1's rmse: 0.886624
[821]	training's rmse: 0.748552	valid_1's rmse: 0.886585
[822]	training's rmse: 0.748499	valid_1's rmse: 0.886567
[823]	training's rmse: 0.748452	valid_1's rmse: 0.886565
[824]	training's rmse: 0.748385	valid_1's rmse: 0.886489
[825]	training's rmse: 0.748334	valid_1's rmse: 0.886485
[826]	training's rmse: 0.748279	valid_1's rmse: 0.886437
[827]	training's rmse: 0.748243	valid_1's rmse: 0.886435
[828]	training's rmse: 0.748191	valid_1's rmse: 0.886452
[829]	training's rmse: 0.748163	valid_1's rmse: 0.886447
[830]	training's rmse: 0.748128	valid_1's rmse: 0.88644
[831]	training's rmse: 0.748064	valid_1's rmse: 0.886473
[832]	training's rmse: 0.748035	valid_1's rmse: 0.886482
[833]	training's rmse: 0.748007	valid_1's rmse: 0.886437
[834]	training's rmse: 0.747943	valid_1's rmse: 0.886444
[835]	training's rmse: 0.747916	valid_1's rmse: 0.886445
[836]	training's rmse: 0.747871	

[896]	training's rmse: 0.745206	valid_1's rmse: 0.885798
[897]	training's rmse: 0.745144	valid_1's rmse: 0.885716
[898]	training's rmse: 0.745114	valid_1's rmse: 0.885716
[899]	training's rmse: 0.745079	valid_1's rmse: 0.885717
[900]	training's rmse: 0.745044	valid_1's rmse: 0.885716
[901]	training's rmse: 0.745	valid_1's rmse: 0.885741
[902]	training's rmse: 0.744957	valid_1's rmse: 0.88575
[903]	training's rmse: 0.74489	valid_1's rmse: 0.885646
[904]	training's rmse: 0.744858	valid_1's rmse: 0.885585
[905]	training's rmse: 0.74481	valid_1's rmse: 0.885578
[906]	training's rmse: 0.744753	valid_1's rmse: 0.885558
[907]	training's rmse: 0.744685	valid_1's rmse: 0.885533
[908]	training's rmse: 0.744659	valid_1's rmse: 0.885548
[909]	training's rmse: 0.744612	valid_1's rmse: 0.885574
[910]	training's rmse: 0.744564	valid_1's rmse: 0.885516
[911]	training's rmse: 0.74453	valid_1's rmse: 0.885533
[912]	training's rmse: 0.744496	valid_1's rmse: 0.885518
[913]	training's rmse: 0.744461	valid_

[971]	training's rmse: 0.742034	valid_1's rmse: 0.884844
[972]	training's rmse: 0.741995	valid_1's rmse: 0.884835
[973]	training's rmse: 0.741953	valid_1's rmse: 0.88484
[974]	training's rmse: 0.741929	valid_1's rmse: 0.884817
[975]	training's rmse: 0.741882	valid_1's rmse: 0.884785
[976]	training's rmse: 0.741857	valid_1's rmse: 0.884722
[977]	training's rmse: 0.741795	valid_1's rmse: 0.884701
[978]	training's rmse: 0.741754	valid_1's rmse: 0.884677
[979]	training's rmse: 0.741712	valid_1's rmse: 0.88467
[980]	training's rmse: 0.741672	valid_1's rmse: 0.884636
[981]	training's rmse: 0.741646	valid_1's rmse: 0.884626
[982]	training's rmse: 0.741618	valid_1's rmse: 0.884612
[983]	training's rmse: 0.741582	valid_1's rmse: 0.884598
[984]	training's rmse: 0.741557	valid_1's rmse: 0.884569
[985]	training's rmse: 0.741535	valid_1's rmse: 0.884569
[986]	training's rmse: 0.741483	valid_1's rmse: 0.884541
[987]	training's rmse: 0.74146	valid_1's rmse: 0.884532
[988]	training's rmse: 0.741399	va

# 训练33个月

In [20]:
"""建模"""
trainData = matrix[matrix['date_block_num'] < 34]
label_train = trainData['item_cnt_month']
X_train = trainData.drop('item_cnt_month', axis=1)

train_data = lgb.Dataset(data=X_train, label=label_train)
params = {
    'objective': 'regression',  # 回归
    'metric': 'rmse',   # 回归问题选择rmse
    'n_estimators': 1000,
    'max_depth': 7,
    'num_leaves': 200,   # 每个弱学习器拥有的叶子的数量
    'learning_rate': 0.005,
    'bagging_fraction': 0.9,    # 每次训练“弱学习器”用的数据比例（应该也是随机的），用于加快训练速度和减小过拟合
    'feature_fraction': 0.3,   # 每次迭代过程中，随机选择30%的特征建树（弱学习器）
    'bagging_seed': 0,
    'early_stop_rounds': 50
}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13284
[LightGBM] [Info] Number of data points in the train set: 6425094, number of used features: 82
[LightGBM] [Info] Start training from score 0.287729
[1]	training's rmse: 1.18396
[2]	training's rmse: 1.18218
[3]	training's rmse: 1.17973
[4]	training's rmse: 1.17714
[5]	training's rmse: 1.17528
[6]	training's rmse: 1.17291
[7]	training's rmse: 1.17052
[8]	training's rmse: 1.16843
[9]	training's rmse: 1.16629
[10]	training's rmse: 1.16415
[11]	training's rmse: 1.16159
[12]	training's rmse: 1.15939
[13]	training's rmse: 1.15741
[14]	training's rmse: 1.15547
[15]	training's rmse: 1.15324
[16]	training's rmse: 1.15115
[17]	training's rmse: 1.1491
[18]	training's rmse: 1.14695
[19]	training's rmse: 1.14501
[20]	training's rmse: 1.14272
[21]	training's rmse: 1.1406
[22]	training's rmse: 1.13835
[23]	training's rmse: 1.13622
[24]	training's 

[107]	training's rmse: 1.00599
[108]	training's rmse: 1.00486
[109]	training's rmse: 1.0037
[110]	training's rmse: 1.00253
[111]	training's rmse: 1.00123
[112]	training's rmse: 0.999911
[113]	training's rmse: 0.998917
[114]	training's rmse: 0.997777
[115]	training's rmse: 0.996756
[116]	training's rmse: 0.995662
[117]	training's rmse: 0.994634
[118]	training's rmse: 0.993509
[119]	training's rmse: 0.992426
[120]	training's rmse: 0.991509
[121]	training's rmse: 0.990494
[122]	training's rmse: 0.989474
[123]	training's rmse: 0.988256
[124]	training's rmse: 0.987304
[125]	training's rmse: 0.986622
[126]	training's rmse: 0.985604
[127]	training's rmse: 0.984762
[128]	training's rmse: 0.983796
[129]	training's rmse: 0.982796
[130]	training's rmse: 0.981949
[131]	training's rmse: 0.981125
[132]	training's rmse: 0.980421
[133]	training's rmse: 0.979487
[134]	training's rmse: 0.978322
[135]	training's rmse: 0.977431
[136]	training's rmse: 0.976485
[137]	training's rmse: 0.975529
[138]	training

[209]	training's rmse: 0.92042
[210]	training's rmse: 0.919901
[211]	training's rmse: 0.919191
[212]	training's rmse: 0.918711
[213]	training's rmse: 0.917931
[214]	training's rmse: 0.917243
[215]	training's rmse: 0.916724
[216]	training's rmse: 0.916065
[217]	training's rmse: 0.915563
[218]	training's rmse: 0.915112
[219]	training's rmse: 0.914623
[220]	training's rmse: 0.914101
[221]	training's rmse: 0.913583
[222]	training's rmse: 0.913163
[223]	training's rmse: 0.912637
[224]	training's rmse: 0.912173
[225]	training's rmse: 0.911717
[226]	training's rmse: 0.911191
[227]	training's rmse: 0.910633
[228]	training's rmse: 0.909906
[229]	training's rmse: 0.909438
[230]	training's rmse: 0.909006
[231]	training's rmse: 0.908579
[232]	training's rmse: 0.908115
[233]	training's rmse: 0.907668
[234]	training's rmse: 0.907168
[235]	training's rmse: 0.906511
[236]	training's rmse: 0.906049
[237]	training's rmse: 0.905418
[238]	training's rmse: 0.904961
[239]	training's rmse: 0.904315
[240]	tra

[294]	training's rmse: 0.880683
[295]	training's rmse: 0.88038
[296]	training's rmse: 0.879945
[297]	training's rmse: 0.879616
[298]	training's rmse: 0.879328
[299]	training's rmse: 0.878857
[300]	training's rmse: 0.878585
[301]	training's rmse: 0.878318
[302]	training's rmse: 0.878028
[303]	training's rmse: 0.877582
[304]	training's rmse: 0.877169
[305]	training's rmse: 0.87692
[306]	training's rmse: 0.87659
[307]	training's rmse: 0.876324
[308]	training's rmse: 0.8759
[309]	training's rmse: 0.875578
[310]	training's rmse: 0.87531
[311]	training's rmse: 0.875051
[312]	training's rmse: 0.874708
[313]	training's rmse: 0.874449
[314]	training's rmse: 0.874209
[315]	training's rmse: 0.873949
[316]	training's rmse: 0.873687
[317]	training's rmse: 0.873271
[318]	training's rmse: 0.873003
[319]	training's rmse: 0.872608
[320]	training's rmse: 0.872251
[321]	training's rmse: 0.871983
[322]	training's rmse: 0.871698
[323]	training's rmse: 0.871467
[324]	training's rmse: 0.871199
[325]	training

[379]	training's rmse: 0.856653
[380]	training's rmse: 0.856389
[381]	training's rmse: 0.85619
[382]	training's rmse: 0.855878
[383]	training's rmse: 0.855616
[384]	training's rmse: 0.855496
[385]	training's rmse: 0.855333
[386]	training's rmse: 0.855176
[387]	training's rmse: 0.854979
[388]	training's rmse: 0.854785
[389]	training's rmse: 0.854642
[390]	training's rmse: 0.85448
[391]	training's rmse: 0.854297
[392]	training's rmse: 0.853887
[393]	training's rmse: 0.853622
[394]	training's rmse: 0.853434
[395]	training's rmse: 0.853249
[396]	training's rmse: 0.852989
[397]	training's rmse: 0.852725
[398]	training's rmse: 0.852417
[399]	training's rmse: 0.852124
[400]	training's rmse: 0.851959
[401]	training's rmse: 0.851689
[402]	training's rmse: 0.85149
[403]	training's rmse: 0.85123
[404]	training's rmse: 0.850938
[405]	training's rmse: 0.850782
[406]	training's rmse: 0.850619
[407]	training's rmse: 0.850302
[408]	training's rmse: 0.85001
[409]	training's rmse: 0.849864
[410]	trainin

[462]	training's rmse: 0.840418
[463]	training's rmse: 0.84022
[464]	training's rmse: 0.840129
[465]	training's rmse: 0.840009
[466]	training's rmse: 0.839853
[467]	training's rmse: 0.83976
[468]	training's rmse: 0.839615
[469]	training's rmse: 0.839323
[470]	training's rmse: 0.839207
[471]	training's rmse: 0.83906
[472]	training's rmse: 0.838937
[473]	training's rmse: 0.838839
[474]	training's rmse: 0.838694
[475]	training's rmse: 0.8386
[476]	training's rmse: 0.838406
[477]	training's rmse: 0.83832
[478]	training's rmse: 0.838222
[479]	training's rmse: 0.838114
[480]	training's rmse: 0.837908
[481]	training's rmse: 0.837804
[482]	training's rmse: 0.837635
[483]	training's rmse: 0.837553
[484]	training's rmse: 0.837357
[485]	training's rmse: 0.837269
[486]	training's rmse: 0.837135
[487]	training's rmse: 0.836909
[488]	training's rmse: 0.836707
[489]	training's rmse: 0.836486
[490]	training's rmse: 0.836309
[491]	training's rmse: 0.836113
[492]	training's rmse: 0.83601
[493]	training'

[543]	training's rmse: 0.829445
[544]	training's rmse: 0.829331
[545]	training's rmse: 0.829192
[546]	training's rmse: 0.829119
[547]	training's rmse: 0.829032
[548]	training's rmse: 0.828908
[549]	training's rmse: 0.828759
[550]	training's rmse: 0.828703
[551]	training's rmse: 0.828564
[552]	training's rmse: 0.828478
[553]	training's rmse: 0.828375
[554]	training's rmse: 0.828308
[555]	training's rmse: 0.828234
[556]	training's rmse: 0.828166
[557]	training's rmse: 0.827947
[558]	training's rmse: 0.827827
[559]	training's rmse: 0.827711
[560]	training's rmse: 0.827621
[561]	training's rmse: 0.827528
[562]	training's rmse: 0.827443
[563]	training's rmse: 0.827343
[564]	training's rmse: 0.827213
[565]	training's rmse: 0.827073
[566]	training's rmse: 0.826987
[567]	training's rmse: 0.826855
[568]	training's rmse: 0.826778
[569]	training's rmse: 0.82672
[570]	training's rmse: 0.82664
[571]	training's rmse: 0.826504
[572]	training's rmse: 0.826312
[573]	training's rmse: 0.826204
[574]	trai

[621]	training's rmse: 0.821178
[622]	training's rmse: 0.821061
[623]	training's rmse: 0.820985
[624]	training's rmse: 0.820944
[625]	training's rmse: 0.820817
[626]	training's rmse: 0.820742
[627]	training's rmse: 0.82059
[628]	training's rmse: 0.820503
[629]	training's rmse: 0.820414
[630]	training's rmse: 0.820348
[631]	training's rmse: 0.820231
[632]	training's rmse: 0.820187
[633]	training's rmse: 0.820127
[634]	training's rmse: 0.820057
[635]	training's rmse: 0.819983
[636]	training's rmse: 0.819803
[637]	training's rmse: 0.819731
[638]	training's rmse: 0.819615
[639]	training's rmse: 0.819547
[640]	training's rmse: 0.819431
[641]	training's rmse: 0.81935
[642]	training's rmse: 0.819307
[643]	training's rmse: 0.819245
[644]	training's rmse: 0.81913
[645]	training's rmse: 0.819053
[646]	training's rmse: 0.818932
[647]	training's rmse: 0.818885
[648]	training's rmse: 0.818822
[649]	training's rmse: 0.818772
[650]	training's rmse: 0.818719
[651]	training's rmse: 0.818664
[652]	train

[698]	training's rmse: 0.815012
[699]	training's rmse: 0.814907
[700]	training's rmse: 0.8148
[701]	training's rmse: 0.814698
[702]	training's rmse: 0.814603
[703]	training's rmse: 0.814545
[704]	training's rmse: 0.814458
[705]	training's rmse: 0.814409
[706]	training's rmse: 0.81436
[707]	training's rmse: 0.814292
[708]	training's rmse: 0.814233
[709]	training's rmse: 0.814125
[710]	training's rmse: 0.81405
[711]	training's rmse: 0.813983
[712]	training's rmse: 0.813918
[713]	training's rmse: 0.813872
[714]	training's rmse: 0.813775
[715]	training's rmse: 0.81373
[716]	training's rmse: 0.813673
[717]	training's rmse: 0.813607
[718]	training's rmse: 0.813488
[719]	training's rmse: 0.813433
[720]	training's rmse: 0.813339
[721]	training's rmse: 0.813299
[722]	training's rmse: 0.813203
[723]	training's rmse: 0.813161
[724]	training's rmse: 0.813036
[725]	training's rmse: 0.812974
[726]	training's rmse: 0.81292
[727]	training's rmse: 0.812842
[728]	training's rmse: 0.812795
[729]	training

[776]	training's rmse: 0.809174
[777]	training's rmse: 0.809139
[778]	training's rmse: 0.809081
[779]	training's rmse: 0.809015
[780]	training's rmse: 0.808956
[781]	training's rmse: 0.808911
[782]	training's rmse: 0.808789
[783]	training's rmse: 0.808703
[784]	training's rmse: 0.808628
[785]	training's rmse: 0.808583
[786]	training's rmse: 0.808452
[787]	training's rmse: 0.808376
[788]	training's rmse: 0.808315
[789]	training's rmse: 0.808244
[790]	training's rmse: 0.808122
[791]	training's rmse: 0.808059
[792]	training's rmse: 0.808029
[793]	training's rmse: 0.807982
[794]	training's rmse: 0.807939
[795]	training's rmse: 0.807894
[796]	training's rmse: 0.807821
[797]	training's rmse: 0.807779
[798]	training's rmse: 0.80764
[799]	training's rmse: 0.807559
[800]	training's rmse: 0.807505
[801]	training's rmse: 0.807472
[802]	training's rmse: 0.807408
[803]	training's rmse: 0.807326
[804]	training's rmse: 0.807266
[805]	training's rmse: 0.807188
[806]	training's rmse: 0.807148
[807]	tra

[853]	training's rmse: 0.804278
[854]	training's rmse: 0.804255
[855]	training's rmse: 0.80422
[856]	training's rmse: 0.804144
[857]	training's rmse: 0.804056
[858]	training's rmse: 0.804029
[859]	training's rmse: 0.803968
[860]	training's rmse: 0.803929
[861]	training's rmse: 0.803888
[862]	training's rmse: 0.803851
[863]	training's rmse: 0.803826
[864]	training's rmse: 0.80379
[865]	training's rmse: 0.803758
[866]	training's rmse: 0.803686
[867]	training's rmse: 0.80364
[868]	training's rmse: 0.803578
[869]	training's rmse: 0.803505
[870]	training's rmse: 0.803456
[871]	training's rmse: 0.803425
[872]	training's rmse: 0.803354
[873]	training's rmse: 0.803285
[874]	training's rmse: 0.803223
[875]	training's rmse: 0.803195
[876]	training's rmse: 0.803166
[877]	training's rmse: 0.803093
[878]	training's rmse: 0.803052
[879]	training's rmse: 0.803001
[880]	training's rmse: 0.802962
[881]	training's rmse: 0.80288
[882]	training's rmse: 0.802849
[883]	training's rmse: 0.802804
[884]	traini

[931]	training's rmse: 0.800073
[932]	training's rmse: 0.800019
[933]	training's rmse: 0.799998
[934]	training's rmse: 0.799964
[935]	training's rmse: 0.799928
[936]	training's rmse: 0.799793
[937]	training's rmse: 0.799739
[938]	training's rmse: 0.799701
[939]	training's rmse: 0.799675
[940]	training's rmse: 0.799643
[941]	training's rmse: 0.799611
[942]	training's rmse: 0.799571
[943]	training's rmse: 0.799531
[944]	training's rmse: 0.799466
[945]	training's rmse: 0.799404
[946]	training's rmse: 0.799345
[947]	training's rmse: 0.799285
[948]	training's rmse: 0.799252
[949]	training's rmse: 0.799229
[950]	training's rmse: 0.799205
[951]	training's rmse: 0.799162
[952]	training's rmse: 0.799118
[953]	training's rmse: 0.799064
[954]	training's rmse: 0.799015
[955]	training's rmse: 0.798976
[956]	training's rmse: 0.798919
[957]	training's rmse: 0.798871
[958]	training's rmse: 0.798833
[959]	training's rmse: 0.798795
[960]	training's rmse: 0.798766
[961]	training's rmse: 0.798731
[962]	tr

In [22]:
six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 34,
 35,
 36,
 37,
 41,
 43,
 44,
 46,
 47,
 48,
 50,
 52,
 54,
 55,
 56,
 58,
 60,
 61,
 63,
 65,
 66,
 68,
 71,
 72,
 73,
 74,
 75,
 78,
 81,
 82,
 84,
 85,
 86,
 88,
 89,
 90,
 92,
 94,
 95,
 96,
 97,
 98,
 99,
 101,
 102,
 105,
 106,
 108,
 111,
 112,
 114,
 115,
 116,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 141,
 144,
 145,
 151,
 152,
 164,
 165,
 169,
 170,
 175,
 176,
 177,
 178,
 179,
 197,
 198,
 199,
 200,
 209,
 210,
 215,
 218,
 220,
 221,
 222,
 223,
 224,
 227,
 230,
 232,
 233,
 235,
 236,
 242,
 243,
 244,
 246,
 247,
 250,
 251,
 256,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 270,
 272,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 282,
 285,
 290,
 297,
 300,
 303,
 305,
 306,
 307,
 309,
 310,
 311,
 312,
 316,
 317,
 318,
 319,
 320,
 32

In [25]:
# test数据
testData = matrix[matrix['date_block_num'] == 34]
X_test = testData.drop('item_cnt_month', axis=1)

# 预测&生成文件
y_test = lgb_model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({ 'ID': range(0, 214200), 'item_cnt_month': y_test})

test0 = test[test.item_id.isin(six_zero_item_id)]
ids = list(test0.ID.values)
submission.loc[submission.ID.isin(ids), 'item_cnt_month'] = 0.0
submission.to_csv('./submit/sub5.csv', index=False)

# 分析生成数据

In [87]:
sub = submission.copy()

In [88]:
dict(sub.item_cnt_month.value_counts())

{0.0: 32489,
 0.30174069734063197: 80,
 0.3445569438650551: 80,
 0.1937535312142451: 72,
 0.24272050118959446: 72,
 0.6145144028295059: 68,
 0.28455048002475836: 60,
 0.28134550022439564: 60,
 0.17656331389837196: 54,
 0.17335833409800916: 54,
 0.6248770471979335: 51,
 0.5924414541879562: 51,
 0.5844818282108888: 51,
 0.7644857872235836: 48,
 0.6186236629085988: 44,
 0.28650318713321715: 44,
 0.23778428502771426: 44,
 0.06544754537438215: 40,
 0.6141128117415696: 40,
 0.6116579694977674: 36,
 0.7321192310170864: 36,
 0.19263747472226594: 36,
 0.24160444469761533: 36,
 0.7479815053328611: 36,
 0.8603593642171734: 36,
 1.146841073962878: 34,
 0.21738908791147799: 33,
 0.5885910882899819: 33,
 0.6292057231105384: 33,
 0.22059406771184073: 33,
 0.5965507142670492: 33,
 0.5840802371229525: 30,
 0.6244754561099972: 30,
 0.59203986310002: 30,
 0.4420687469744001: 28,
 0.3944467965498531: 28,
 0.2328771864561249: 28,
 0.11957991934726525: 28,
 0.15890698711076753: 28,
 0.21926696604053494: 27,

In [89]:
# 对sub数据进行变换

def myfun(val):
    return round(val, 0)
    
sub['item_cnt_month'] = sub['item_cnt_month'].apply(myfun)

In [90]:
dict(sub.item_cnt_month.value_counts())

{0.0: 185264,
 1.0: 21912,
 2.0: 3196,
 3.0: 1650,
 4.0: 1116,
 5.0: 371,
 6.0: 189,
 7.0: 120,
 8.0: 74,
 9.0: 65,
 10.0: 58,
 13.0: 33,
 19.0: 27,
 11.0: 27,
 12.0: 21,
 17.0: 16,
 14.0: 15,
 20.0: 15,
 15.0: 14,
 16.0: 9,
 18.0: 8}

In [80]:
m33 = matrix[matrix.date_block_num == 33]
dict(m33.item_cnt_month.value_counts())

{0.0: 206701,
 1.0: 21351,
 2.0: 5070,
 3.0: 1907,
 4.0: 959,
 5.0: 582,
 6.0: 341,
 20.0: 261,
 7.0: 226,
 8.0: 158,
 9.0: 138,
 10.0: 103,
 11.0: 65,
 12.0: 63,
 13.0: 54,
 15.0: 50,
 14.0: 46,
 16.0: 34,
 18.0: 25,
 17.0: 21,
 19.0: 17}

In [77]:
m32 = matrix[matrix.date_block_num == 32]
dict(m32.item_cnt_month.value_counts())

{0.0: 189052,
 1.0: 20128,
 2.0: 4810,
 3.0: 1777,
 4.0: 904,
 5.0: 525,
 6.0: 308,
 20.0: 268,
 7.0: 222,
 8.0: 137,
 9.0: 115,
 10.0: 81,
 11.0: 62,
 12.0: 56,
 13.0: 47,
 19.0: 31,
 16.0: 30,
 14.0: 29,
 15.0: 27,
 17.0: 24,
 18.0: 22}

In [78]:
m31 = matrix[matrix.date_block_num == 31]
dict(m31.item_cnt_month.value_counts())

{0.0: 181110,
 1.0: 22413,
 2.0: 5677,
 3.0: 2151,
 4.0: 1044,
 5.0: 683,
 6.0: 431,
 7.0: 255,
 8.0: 154,
 20.0: 123,
 9.0: 121,
 10.0: 92,
 11.0: 59,
 12.0: 55,
 13.0: 41,
 15.0: 30,
 16.0: 24,
 14.0: 22,
 17.0: 18,
 19.0: 17,
 18.0: 16}

In [79]:
m30 = matrix[matrix.date_block_num == 30]
dict(m30.item_cnt_month.value_counts())

{0.0: 195431,
 1.0: 22977,
 2.0: 5720,
 3.0: 2030,
 4.0: 964,
 5.0: 520,
 6.0: 326,
 7.0: 197,
 8.0: 143,
 20.0: 120,
 9.0: 111,
 10.0: 80,
 11.0: 56,
 12.0: 54,
 14.0: 39,
 13.0: 38,
 15.0: 21,
 16.0: 19,
 18.0: 18,
 17.0: 14,
 19.0: 11}

# 新的开始