In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder
import time
from itertools import product
from icecream import ic

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

sales_train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 计算每个商品每个月的销售量，假如某个商品在某个月没有数据，则填充0（即这个月的销售量为0）
sales_by_item_id = sales_train.pivot_table(index=['item_id'], values=['item_cnt_day'], columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id.columns = sales_by_item_id.columns.droplevel().map(str)
sales_by_item_id.columns.values[0] = 'item_id'
sales_by_item_id = sales_by_item_id.rename_axis(None, axis=1)

# 获取最近6个月销售量为0的数据
# six_zero = sales_by_item_id[(sales_by_item_id['28'] == 0) & (sales_by_item_id['29'] == 0) & (sales_by_item_id['30'] == 0) & (sales_by_item_id['31'] == 0) & (sales_by_item_id['32'] == 0) & (sales_by_item_id['33'] == 0)]
# six_zero_item_id = list(six_zero['item_id'].values)   # item_id列表
# test.loc[test.item_id.isin(six_zero_item_id), 'item_cnt_month'] = 0  # 将test数据中（最近六个月销量为0）的数据月销量设为0，有7812个

# 计算每个商店每个月的销量
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'], aggfunc=np.sum, fill_value=0, columns='date_block_num').reset_index()
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)    # 将两层column转化为一层column,保留下层column
sales_by_shop_id.columns.values[0] = 'shop_id'
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1)   # 将列方向的轴重命名为none

# zero = sales_train[sales_train.date_block_num==0]
# ic(zero.shop_id.unique(), len(zero.item_id.unique()), len(zero.shop_id.unique()), len(zero.shop_id.unique()) * len(zero.item_id.unique()))
# ic(sales_train.shop_id.unique(), len(sales_train.item_id.unique()), len(sales_train.shop_id.unique()), len(sales_train.shop_id.unique()) * len(sales_train.item_id.unique()))

"""组合date_block_num,shop_id,item_id(部分) 总量：10913850"""
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix.sort_values(cols, inplace=True)  # 排序
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']    # 某一天的销售额

# 分组
groupby = sales_train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
groupby = groupby.rename(columns={'item_cnt_day': 'item_cnt_month'})
matrix = matrix.merge(groupby, on=['date_block_num','shop_id','item_id'], how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = matrix['item_cnt_month'].astype(np.float16)

# test数据
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# 合并matrix,test
matrix = pd.concat([matrix, test[cols]], ignore_index=True, axis=0)
matrix['item_cnt_month'].fillna(0, inplace=True)

# 商品信息
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
matrix = pd.merge(left=matrix, right=items, on='item_id', how='left')  # 合并

# 商品类别
le = LabelEncoder()
categories = pd.read_csv('./data/item_categories.csv')
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x:x[0].strip())
categories['subtype'] = categories['split'].map(lambda x:x[1].strip() if len(x)>1 else x[0].strip())
categories = categories[['item_category_id','type','subtype']]
categories['cat_type_code'] = le.fit_transform(categories['type'])
categories['cat_subtype_code'] = le.fit_transform(categories['subtype'])
matrix = pd.merge(left=matrix, right=categories[['item_category_id','cat_type_code','cat_subtype_code']], on='item_category_id', how='left')    # 合并

# 商店信息
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

def st(name):
    ic(name)
    if 'ТЦ' in name or 'ТРЦ' in name:
        shopt = 'ТЦ'
    elif 'ТК' in name:
        shopt = 'ТК'
    elif 'ТРК' in name:
        shopt = 'ТРК'
    elif 'МТРЦ' in name:
        shopt = 'МТРЦ'
    else:
        shopt = 'UNKNOWN'
    return shopt
shops['shop_type'] = shops['shop_name'].apply(st)

shops.loc[shops.shop_id == 21, 'shop_type'] = 'МТРЦ'   # 修正
shops['shop_type_code'] = le.fit_transform(shops['shop_type'])
matrix = pd.merge(left=matrix, right=shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')    # 合并
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['cat_type_code'] = matrix['cat_type_code'].astype(np.int8)
matrix['cat_subtype_code'] = matrix['cat_subtype_code'].astype(np.int8)
matrix['shop_city_code'] = matrix['shop_city_code'].astype(np.int8)
matrix['shop_type_code'] = matrix['shop_type_code'].astype(np.int8)


"""历史信息"""

def lag_features(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(left=df, right=shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

matrix = lag_features(matrix, [1,2,3,6,12], 'item_cnt_month')

# 月销量（所有商品）
group = matrix.groupby('date_block_num').agg({'item_cnt_month': 'mean'}).reset_index()
group.columns = ['date_block_num', 'date_avg_item_cnt']
matrix = pd.merge(left=matrix, right=group, on='date_block_num', how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_avg_item_cnt')
matrix.drop('date_avg_item_cnt', axis=1, inplace=True)

# 月销量（每一件商品）
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop('date_item_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个商店 ）
group = matrix.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop('date_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（每个类别）
group = matrix.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_avg_item_cnt')
matrix.drop('date_cat_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品类别-商店）
group = matrix.groupby(['date_block_num','item_category_id','shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_cat_shop_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','item_category_id','shop_id'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_cat_shop_avg_item_cnt')
matrix.drop('date_cat_shop_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品大类）
group = matrix.groupby(['date_block_num','cat_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_type_avg_item_cnt')
matrix.drop('date_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商品大类） ++++++++++++ 和 月销量（商品）是重复的，因为每一个商品，类别是确定的，大类也是确定的
group = matrix.groupby(['date_block_num', 'item_id', 'cat_type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_type_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'cat_type_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_type_avg_item_cnt')
matrix.drop('date_item_type_avg_item_cnt', axis=1, inplace=True)

# 月销量（商店城市）
group = matrix.groupby(['date_block_num','shop_city_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['date_city_avg_item_cnt']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on=['date_block_num','shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_city_avg_item_cnt')
matrix.drop('date_city_avg_item_cnt', axis=1, inplace=True)

# 月销量（商品-商店城市）
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_city_avg_item_cnt']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num', 'item_id', 'shop_city_code'], how='left')
matrix = lag_features(matrix, [1,2,3,6,12], 'date_item_city_avg_item_cnt')
matrix.drop('date_item_city_avg_item_cnt', axis=1, inplace=True)

# 趋势特征
group = sales_train.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group = group.reset_index()
matrix = pd.merge(left=matrix, right=group, on='item_id', how='left')

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group = group.reset_index()
matrix=pd.merge(left=matrix, right=group, on=['date_block_num','item_id'], how='left')

matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# 计算matrix中商品的历史价格 #
lags = [1,2,3,4,5,6,12]
matrix = lag_features(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_'+str(i)]=(matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if pd.notnull(row['delta_price_lag_'+str(i)]):  # 如果不是NaN
            return row['delta_price_lag_'+str(i)]
    return 0
matrix['delta_price_lag']=matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)

features_to_drop = ['item_avg_item_price','date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    features_to_drop += ['delta_price_lag_'+str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)

ic| name: '!Якутск Орджоникидзе, 56 фран'
ic| name: '!Якутск ТЦ "Центральный" фран'
ic| name: 'Адыгея ТЦ "Мега"'
ic| name: 'Балашиха ТРК "Октябрь-Киномир"'
ic| name: 'Волжский ТЦ "Волга Молл"'
ic| name: 'Вологда ТРЦ "Мармелад"'
ic| name: 'Воронеж (Плехановская, 13)'
ic| name: 'Воронеж ТРЦ "Максимир"'
ic| name: 'Воронеж ТРЦ Сити-Парк "Град"'
ic| name: 'Выездная Торговля'
ic| name: 'Жуковский ул. Чкалова 39м?'
ic| name: 'Жуковский ул. Чкалова 39м²'
ic| name: 'Интернет-магазин ЧС'
ic| name: 'Казань ТЦ "Бехетле"'
ic| name: 'Казань ТЦ "ПаркХаус" II'
ic| name: 'Калуга ТРЦ "XXI век"'
ic| name: 'Коломна ТЦ "Рио"'
ic| name: 'Красноярск ТЦ "Взлетка Плаза"'
ic| name: 'Красноярск ТЦ "Июнь"'
ic| name: 'Курск ТЦ "Пушкинский"'
ic| name: 'Москва "Распродажа"'
ic| name: 'Москва МТРЦ "Афи Молл"'
ic| name: 'Москва Магазин С21'
ic| name: 'Москва ТК "Буденовский" (пав.А2)'
ic| name: 'Москва ТК "Буденовский" (пав.К7)'
ic| name: 'Москва ТРК "Атриум"'
ic| name: 'Москва ТЦ "Ареал" (Беляево)'
ic| name: 'Москва 

In [40]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_city_avg_item_cnt_lag_2,date_city_avg_item_cnt_lag_3,date_city_avg_item_cnt_lag_6,date_city_avg_item_cnt_lag_12,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag
0,0,0,19,0.0,40,11,4,0,0,,...,,,,,,,,,,0.000000
1,0,0,27,0.0,19,5,10,0,0,,...,,,,,,,,,,0.000000
2,0,0,28,0.0,30,8,55,0,0,,...,,,,,,,,,,0.000000
3,0,0,29,0.0,23,5,16,0,0,,...,,,,,,,,,,0.000000
4,0,0,32,6.0,40,11,4,0,0,,...,,,,,,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.139038,0.144897,0.143433,,0.5,0.0,0.0,0.0,,-0.475098
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.139038,,,,0.0,0.0,,,,0.081116
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.139038,0.144897,0.143433,0.182007,0.0,0.5,0.0,0.0,0.0,0.155884
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.139038,0.144897,0.143433,,0.0,0.0,0.0,0.0,,-0.091736


# 每个月的天数

In [41]:
matrix['month'] = matrix['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days)

In [42]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_city_avg_item_cnt_lag_6,date_city_avg_item_cnt_lag_12,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days
0,0,0,19,0.0,40,11,4,0,0,,...,,,,,,,,0.000000,0,31
1,0,0,27,0.0,19,5,10,0,0,,...,,,,,,,,0.000000,0,31
2,0,0,28,0.0,30,8,55,0,0,,...,,,,,,,,0.000000,0,31
3,0,0,29,0.0,23,5,16,0,0,,...,,,,,,,,0.000000,0,31
4,0,0,32,6.0,40,11,4,0,0,,...,,,,,,,,0.000000,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.143433,,0.5,0.0,0.0,0.0,,-0.475098,10,30
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,,,0.0,0.0,,,,0.081116,10,30
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.143433,0.182007,0.0,0.5,0.0,0.0,0.0,0.155884,10,30
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.143433,,0.0,0.0,0.0,0.0,,-0.091736,10,30


In [43]:
matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128050 entries, 0 to 11128049
Data columns (total 62 columns):
 #   Column                              Dtype  
---  ------                              -----  
 0   date_block_num                      int8   
 1   shop_id                             int8   
 2   item_id                             int16  
 3   item_cnt_month                      float16
 4   item_category_id                    int8   
 5   cat_type_code                       int8   
 6   cat_subtype_code                    int8   
 7   shop_city_code                      int8   
 8   shop_type_code                      int8   
 9   item_cnt_month_lag_1                float16
 10  item_cnt_month_lag_2                float16
 11  item_cnt_month_lag_3                float16
 12  item_cnt_month_lag_6                float16
 13  item_cnt_month_lag_12               float16
 14  date_avg_item_cnt_lag_1             float16
 15  date_avg_item_cnt_lag_2             float16
 16

In [44]:
matrix['days'] = matrix['days'].astype(np.int8)
matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128050 entries, 0 to 11128049
Data columns (total 62 columns):
 #   Column                              Dtype  
---  ------                              -----  
 0   date_block_num                      int8   
 1   shop_id                             int8   
 2   item_id                             int16  
 3   item_cnt_month                      float16
 4   item_category_id                    int8   
 5   cat_type_code                       int8   
 6   cat_subtype_code                    int8   
 7   shop_city_code                      int8   
 8   shop_type_code                      int8   
 9   item_cnt_month_lag_1                float16
 10  item_cnt_month_lag_2                float16
 11  item_cnt_month_lag_3                float16
 12  item_cnt_month_lag_6                float16
 13  item_cnt_month_lag_12               float16
 14  date_avg_item_cnt_lag_1             float16
 15  date_avg_item_cnt_lag_2             float16
 16

# new

In [45]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_city_avg_item_cnt_lag_6,date_city_avg_item_cnt_lag_12,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days
0,0,0,19,0.0,40,11,4,0,0,,...,,,,,,,,0.000000,0,31
1,0,0,27,0.0,19,5,10,0,0,,...,,,,,,,,0.000000,0,31
2,0,0,28,0.0,30,8,55,0,0,,...,,,,,,,,0.000000,0,31
3,0,0,29,0.0,23,5,16,0,0,,...,,,,,,,,0.000000,0,31
4,0,0,32,6.0,40,11,4,0,0,,...,,,,,,,,0.000000,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.143433,,0.5,0.0,0.0,0.0,,-0.475098,10,30
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,,,0.0,0.0,,,,0.081116,10,30
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.143433,0.182007,0.0,0.5,0.0,0.0,0.0,0.155884,10,30
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.143433,,0.0,0.0,0.0,0.0,,-0.091736,10,30


In [46]:
np.unique(matrix['item_cnt_month'].values)

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20.], dtype=float16)

In [47]:
a = matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
len(a)

11128050

In [48]:
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')

In [49]:
matrix['item_shop_first_sale'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34], dtype=int8)

# item_first_sale

In [50]:
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

In [51]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days,item_shop_first_sale,item_first_sale
0,0,0,19,0.0,40,11,4,0,0,,...,,,,,,0.000000,0,31,0,0
1,0,0,27,0.0,19,5,10,0,0,,...,,,,,,0.000000,0,31,0,0
2,0,0,28,0.0,30,8,55,0,0,,...,,,,,,0.000000,0,31,0,0
3,0,0,29,0.0,23,5,16,0,0,,...,,,,,,0.000000,0,31,0,0
4,0,0,32,6.0,40,11,4,0,0,,...,,,,,,0.000000,0,31,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.5,0.0,0.0,0.0,,-0.475098,10,30,11,11
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.0,0.0,,,,0.081116,10,30,2,2
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.0,0.5,0.0,0.0,0.0,0.155884,10,30,34,34
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.0,0.0,0.0,0.0,,-0.091736,10,30,11,11


# substract

In [53]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days,item_shop_first_sale,item_first_sale
0,0,0,19,0.0,40,11,4,0,0,,...,,,,,,0.000000,0,31,0,0
1,0,0,27,0.0,19,5,10,0,0,,...,,,,,,0.000000,0,31,0,0
2,0,0,28,0.0,30,8,55,0,0,,...,,,,,,0.000000,0,31,0,0
3,0,0,29,0.0,23,5,16,0,0,,...,,,,,,0.000000,0,31,0,0
4,0,0,32,6.0,40,11,4,0,0,,...,,,,,,0.000000,0,31,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.5,0.0,0.0,0.0,,-0.475098,10,30,11,11
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.0,0.0,,,,0.081116,10,30,2,2
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.0,0.5,0.0,0.0,0.0,0.155884,10,30,34,34
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.0,0.0,0.0,0.0,,-0.091736,10,30,11,11


In [54]:
matrix = matrix[matrix['date_block_num'] > 11]
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days,item_shop_first_sale,item_first_sale
4488756,12,2,27,0.0,19,5,10,1,4,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.282715,0,31,12,12
4488757,12,2,30,0.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,,-0.483398,0,31,11,11
4488758,12,2,31,0.0,37,11,1,1,4,0.0,...,0.0,0.0,0.0,0.0,,-0.137451,0,31,11,11
4488759,12,2,32,1.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.407227,0,31,12,12
4488760,12,2,33,1.0,37,11,1,1,4,1.0,...,1.0,2.0,0.0,0.0,1.0,-0.225464,0,31,12,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.5,0.0,0.0,0.0,,-0.475098,10,30,11,11
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.0,0.0,,,,0.081116,10,30,2,2
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.0,0.5,0.0,0.0,0.0,0.155884,10,30,34,34
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.0,0.0,0.0,0.0,,-0.091736,10,30,11,11


In [55]:
matrix.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'item_category_id', 'cat_type_code', 'cat_subtype_code',
       'shop_city_code', 'shop_type_code', 'item_cnt_month_lag_1',
       'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_lag_6',
       'item_cnt_month_lag_12', 'date_avg_item_cnt_lag_1',
       'date_avg_item_cnt_lag_2', 'date_avg_item_cnt_lag_3',
       'date_avg_item_cnt_lag_6', 'date_avg_item_cnt_lag_12',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12',
       'date_cat_avg_item_cnt_lag_1', 'date_cat_avg_item_cnt_lag_2',
       'date_cat_avg_item_cnt_lag_3', 'date_cat_avg_item_cnt_lag_6',
       'date_cat_avg_item_cnt_lag_12', 'date_cat

In [57]:
columns = matrix.columns

In [58]:
column_null = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null.append(i)
column_null

['item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_lag_6',
 'item_cnt_month_lag_12',
 'date_avg_item_cnt_lag_1',
 'date_avg_item_cnt_lag_2',
 'date_avg_item_cnt_lag_3',
 'date_avg_item_cnt_lag_6',
 'date_avg_item_cnt_lag_12',
 'date_item_avg_item_cnt_lag_1',
 'date_item_avg_item_cnt_lag_2',
 'date_item_avg_item_cnt_lag_3',
 'date_item_avg_item_cnt_lag_6',
 'date_item_avg_item_cnt_lag_12',
 'date_shop_avg_item_cnt_lag_1',
 'date_shop_avg_item_cnt_lag_2',
 'date_shop_avg_item_cnt_lag_3',
 'date_shop_avg_item_cnt_lag_6',
 'date_shop_avg_item_cnt_lag_12',
 'date_cat_avg_item_cnt_lag_1',
 'date_cat_avg_item_cnt_lag_2',
 'date_cat_avg_item_cnt_lag_3',
 'date_cat_avg_item_cnt_lag_6',
 'date_cat_avg_item_cnt_lag_12',
 'date_cat_shop_avg_item_cnt_lag_1',
 'date_cat_shop_avg_item_cnt_lag_2',
 'date_cat_shop_avg_item_cnt_lag_3',
 'date_cat_shop_avg_item_cnt_lag_6',
 'date_cat_shop_avg_item_cnt_lag_12',
 'date_type_avg_item_cnt_lag_1',
 'date_type_avg_item_

In [59]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days,item_shop_first_sale,item_first_sale
4488756,12,2,27,0.0,19,5,10,1,4,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.282715,0,31,12,12
4488757,12,2,30,0.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,,-0.483398,0,31,11,11
4488758,12,2,31,0.0,37,11,1,1,4,0.0,...,0.0,0.0,0.0,0.0,,-0.137451,0,31,11,11
4488759,12,2,32,1.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.407227,0,31,12,12
4488760,12,2,33,1.0,37,11,1,1,4,1.0,...,1.0,2.0,0.0,0.0,1.0,-0.225464,0,31,12,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.5,0.0,0.0,0.0,,-0.475098,10,30,11,11
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.0,0.0,,,,0.081116,10,30,2,2
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.0,0.5,0.0,0.0,0.0,0.155884,10,30,34,34
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.0,0.0,0.0,0.0,,-0.091736,10,30,11,11


In [60]:
for i in column_null:
    matrix[i].fillna(0, inplace=True)
matrix

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days,item_shop_first_sale,item_first_sale
4488756,12,2,27,0.0,19,5,10,1,4,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.282715,0,31,12,12
4488757,12,2,30,0.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.483398,0,31,11,11
4488758,12,2,31,0.0,37,11,1,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.137451,0,31,11,11
4488759,12,2,32,1.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.407227,0,31,12,12
4488760,12,2,33,1.0,37,11,1,1,4,1.0,...,1.0,2.0,0.0,0.0,1.0,-0.225464,0,31,12,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.5,0.0,0.0,0.0,0.0,-0.475098,10,30,11,11
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.081116,10,30,2,2
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.0,0.5,0.0,0.0,0.0,0.155884,10,30,34,34
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.091736,10,30,11,11


In [61]:
column_null2 = []
for i in columns:
    if len(matrix[matrix[i].isnull()]) > 0:
        column_null2.append(i)
column_null2

[]

# 建模

In [62]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,cat_type_code,cat_subtype_code,shop_city_code,shop_type_code,item_cnt_month_lag_1,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_2,date_item_city_avg_item_cnt_lag_3,date_item_city_avg_item_cnt_lag_6,date_item_city_avg_item_cnt_lag_12,delta_price_lag,month,days,item_shop_first_sale,item_first_sale
4488756,12,2,27,0.0,19,5,10,1,4,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.282715,0,31,12,12
4488757,12,2,30,0.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.483398,0,31,11,11
4488758,12,2,31,0.0,37,11,1,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.137451,0,31,11,11
4488759,12,2,32,1.0,40,11,4,1,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.407227,0,31,12,12
4488760,12,2,33,1.0,37,11,1,1,4,1.0,...,1.0,2.0,0.0,0.0,1.0,-0.225464,0,31,12,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,55,13,2,21,4,1.0,...,0.5,0.0,0.0,0.0,0.0,-0.475098,10,30,11,11
11128046,34,45,16188,0.0,64,14,42,21,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.081116,10,30,2,2
11128047,34,45,15757,0.0,55,13,2,21,4,0.0,...,0.0,0.5,0.0,0.0,0.0,0.155884,10,30,34,34
11128048,34,45,19648,0.0,40,11,4,21,4,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.091736,10,30,11,11


In [101]:
matrix.shape

(6639294, 64)

In [102]:
import pickle
matrix.to_pickle('./pickle/matrix.pkl')