In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from icecream import ic
from sklearn.preprocessing import LabelEncoder

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 训练数据
train = pd.read_csv('./data/sales_train.csv')   # (2935849, 6)

# 测试数据
test = pd.read_csv('./data/test.csv')   # (214200, 3)

# 分析商品类别 item_categories添加 split type sub_type type_code sub_type_code 特征
itemcat = pd.read_csv('./data/item_categories.csv')
itemcat['split'] = itemcat.item_category_name.str.split('-')
itemcat['type'] = itemcat['split'].map(lambda x:x[0])
itemcat['sub_type'] = itemcat['split'].map(lambda x:x[1] if len(x) > 1 else x[0])
le = LabelEncoder()
itemcat['type_code'] = le.fit_transform(itemcat.type)
itemcat['sub_type_code'] = le.fit_transform(itemcat.sub_type)

# 合并[train,test],items
items = pd.read_csv('./data/items.csv')
items = items[['item_id', 'item_category_id']]
train = pd.merge(left=train, right=items, on='item_id')
# 对原数据没有改变
test = pd.merge(left=test, right=items, on='item_id')
test = test.sort_values(by='ID')
test = test.reset_index(drop=True)

# 删除训练数据中的异常数据
train = train[train.item_cnt_day <= 1000]
train = train[train.item_price <= 100000]

# 中位数填补item_price为负数的数据
price_df = train[train.item_id == 2973]
train.loc[train.item_price < 0, 'item_price'] = price_df.item_price.median()

# 构造item_cnt_month特征
train2 = train.groupby(['shop_id','item_id','date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()
train2 = train2.rename(columns={'item_cnt_day': 'item_cnt_month'})

# date_block到month的映射
month_lst = list(range(1, 13)) + list(range(1, 13)) + list(range(1, 11))
date_num_lst = list(range(0, 34))
dateBlock2month = dict(tuple(zip(date_num_lst, month_lst)))

# train2添加month特征
train2['month'] = train2['date_block_num'].map(dateBlock2month)

# test添加date_block_num，month特征
test['date_block_num'] = 34
test['month'] = 11

# train2添加item_category_id特征
train2 = pd.merge(left=train2, right=items, on='item_id', how='left')

# train2添加type_code,sub_type_code
itemcat_simple = itemcat[['item_category_id','type_code','sub_type_code']]
train2 = pd.merge(left=train2, right=itemcat_simple, on='item_category_id', how='left')

# test添加type_code,sub_type_code
test = pd.merge(left=test, right=itemcat_simple, on='item_category_id', how='left')

# 分析shops
shops = pd.read_csv('./data/shops.csv')
shops['split']=shops.shop_name.str.split(' ')
shops['shop_city'] = shops['split'].map(lambda x:x[0])
shops['shop_city_code'] = le.fit_transform(shops['shop_city'])

# train2,test添加shop_city_code特征
shops_simple = shops[['shop_id','shop_city_code']]
train2 = pd.merge(left=train2, right=shops_simple, on='shop_id', how='left')
test = pd.merge(left=test, right=shops_simple, on='shop_id', how='left')

In [76]:
train2

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,item_category_id,type_code,sub_type_code,shop_city_code
0,0,30,1,31.0,2,40,11,4,0
1,0,31,1,11.0,2,37,11,1,0
2,0,32,0,6.0,1,40,11,4,0
3,0,32,1,10.0,2,40,11,4,0
4,0,33,0,3.0,1,37,11,1,0
...,...,...,...,...,...,...,...,...,...
1609118,59,22164,27,2.0,4,37,11,1,31
1609119,59,22164,30,1.0,7,37,11,1,31
1609120,59,22167,9,1.0,10,49,12,36,31
1609121,59,22167,11,2.0,12,49,12,36,31


In [82]:
train2[(train2.date_block_num == 0) & (train2.shop_id == 2) & (train2.item_id == 19)]

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,item_category_id,type_code,sub_type_code,shop_city_code


In [86]:
train[(train.item_id == 19)]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
497361,12.01.2013,0,25,19,28.0,1.0,40


In [84]:
train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,02.01.2013,0,59,22154,999.0,1.0,37
1,23.01.2013,0,24,22154,999.0,1.0,37
2,20.01.2013,0,27,22154,999.0,1.0,37
3,02.01.2013,0,25,22154,999.0,1.0,37
4,03.01.2013,0,25,22154,999.0,1.0,37
...,...,...,...,...,...,...,...
2935844,17.10.2015,33,25,8428,249.0,1.0,40
2935845,01.10.2015,33,25,7903,12198.0,1.0,15
2935846,29.10.2015,33,25,7610,2890.0,1.0,64
2935847,22.10.2015,33,25,7635,2100.0,1.0,64


# 新的开始

In [20]:
X = train2.drop(['item_category_id'], axis=1)

In [21]:
X

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,type_code,sub_type_code,shop_city_code
0,0,30,1,31.0,2,11,4,0
1,0,31,1,11.0,2,11,1,0
2,0,32,0,6.0,1,11,4,0
3,0,32,1,10.0,2,11,4,0
4,0,33,0,3.0,1,11,1,0
...,...,...,...,...,...,...,...,...
1609118,59,22164,27,2.0,4,11,1,31
1609119,59,22164,30,1.0,7,11,1,31
1609120,59,22167,9,1.0,10,12,36,31
1609121,59,22167,11,2.0,12,12,36,31


In [22]:
X_train = X[X.date_block_num < 33].drop('item_cnt_month', axis=1)
X_train

Unnamed: 0,shop_id,item_id,date_block_num,month,type_code,sub_type_code,shop_city_code
0,0,30,1,2,11,4,0
1,0,31,1,2,11,1,0
2,0,32,0,1,11,4,0
3,0,32,1,2,11,4,0
4,0,33,0,1,11,1,0
...,...,...,...,...,...,...,...
1609118,59,22164,27,4,11,1,31
1609119,59,22164,30,7,11,1,31
1609120,59,22167,9,10,12,36,31
1609121,59,22167,11,12,12,36,31


In [23]:
y_train = X[X.date_block_num < 33]['item_cnt_month']
y_train

0          31.0
1          11.0
2           6.0
3          10.0
4           3.0
           ... 
1609118     2.0
1609119     1.0
1609120     1.0
1609121     2.0
1609122     1.0
Name: item_cnt_month, Length: 1577592, dtype: float64

In [24]:
X_valid = X[X.date_block_num == 33].drop('item_cnt_month', axis=1)
X_valid

Unnamed: 0,shop_id,item_id,date_block_num,month,type_code,sub_type_code,shop_city_code
8113,2,31,33,10,11,1,1
8205,2,486,33,10,15,0,1
8254,2,787,33,10,12,36,1
8284,2,794,33,10,15,0,1
8367,2,968,33,10,11,4,1
...,...,...,...,...,...,...,...
1608997,59,22087,33,10,20,64,31
1609029,59,22088,33,10,20,64,31
1609046,59,22091,33,10,20,64,31
1609072,59,22100,33,10,12,19,31


In [25]:
y_valid = X[X.date_block_num == 33]['item_cnt_month']
y_valid

8113       1.0
8205       3.0
8254       1.0
8284       1.0
8367       1.0
          ... 
1608997    6.0
1609029    2.0
1609046    1.0
1609072    1.0
1609075    1.0
Name: item_cnt_month, Length: 31531, dtype: float64

# 建模

In [62]:
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

In [67]:
X_train

Unnamed: 0,shop_id,item_id,date_block_num,month,type_code,sub_type_code,shop_city_code
0,0,30,1,2,11,4,0
1,0,31,1,2,11,1,0
2,0,32,0,1,11,4,0
3,0,32,1,2,11,4,0
4,0,33,0,1,11,1,0
...,...,...,...,...,...,...,...
1609118,59,22164,27,4,11,1,31
1609119,59,22164,30,7,11,1,31
1609120,59,22167,9,10,12,36,31
1609121,59,22167,11,12,12,36,31


In [68]:
X_valid

Unnamed: 0,shop_id,item_id,date_block_num,month,type_code,sub_type_code,shop_city_code
8113,2,31,33,10,11,1,1
8205,2,486,33,10,15,0,1
8254,2,787,33,10,12,36,1
8284,2,794,33,10,15,0,1
8367,2,968,33,10,11,4,1
...,...,...,...,...,...,...,...
1608997,59,22087,33,10,20,64,31
1609029,59,22088,33,10,20,64,31
1609046,59,22091,33,10,20,64,31
1609072,59,22100,33,10,12,19,31


In [69]:
train_valid = pd.concat([X_train, X_valid], axis=0, ignore_index=True)
std_scaler = StandardScaler()
std_scaler.fit(train_valid)
X_train_std = std_scaler.transform(X_train)
X_valid_std = std_scaler.transform(X_valid)

In [73]:
train_data = lgb.Dataset(data=X_train, label=y_train)
train_data

<lightgbm.basic.Dataset at 0x16600a30b00>

In [74]:
valid_data = lgb.Dataset(data=X_valid, label=y_valid)
valid_data

<lightgbm.basic.Dataset at 0x1660363d5c0>

In [75]:
params = {'num_leaves':50, 'num_trees':300, 'objective':'regression', 'metric': 'rmse', 'feature_fraction':1, 'max_depth': 20, 'min_data_in_leaf': 20}
lgb.train(params, train_data, valid_sets=[train_data, valid_data])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 468
[LightGBM] [Info] Number of data points in the train set: 1577592, number of used features: 7
[LightGBM] [Info] Start training from score 2.267474
[1]	training's rmse: 8.10124	valid_1's rmse: 6.71526
[2]	training's rmse: 7.77194	valid_1's rmse: 6.37013
[3]	training's rmse: 7.50281	valid_1's rmse: 6.14625
[4]	training's rmse: 7.27433	valid_1's rmse: 5.98006
[5]	training's rmse: 7.07974	valid_1's rmse: 5.92944
[6]	training's rmse: 6.91898	valid_1's rmse: 5.90579
[7]	training's rmse: 6.7687	valid_1's rmse: 5.82853
[8]	training's rmse: 6.64815	valid_1's rmse: 5.85972
[9]	training's rmse: 6.55016	valid_1's rmse: 5.87553
[10]	training's rmse: 6.46473	valid_1's rmse: 5.82034
[11]	training's rmse: 6.38805	valid_1's rmse: 5.90565
[12]	training's rmse: 6.3252	valid_1's rmse: 5.85421
[13]	training's rmse: 6.26693	valid_1's rmse: 5.81176
[14]	tr

[146]	training's rmse: 5.09801	valid_1's rmse: 5.50991
[147]	training's rmse: 5.09721	valid_1's rmse: 5.50952
[148]	training's rmse: 5.09332	valid_1's rmse: 5.51864
[149]	training's rmse: 5.09095	valid_1's rmse: 5.51761
[150]	training's rmse: 5.08971	valid_1's rmse: 5.51839
[151]	training's rmse: 5.08749	valid_1's rmse: 5.52202
[152]	training's rmse: 5.08563	valid_1's rmse: 5.52326
[153]	training's rmse: 5.08494	valid_1's rmse: 5.52241
[154]	training's rmse: 5.08391	valid_1's rmse: 5.52244
[155]	training's rmse: 5.08274	valid_1's rmse: 5.52242
[156]	training's rmse: 5.0812	valid_1's rmse: 5.52226
[157]	training's rmse: 5.07869	valid_1's rmse: 5.52536
[158]	training's rmse: 5.07583	valid_1's rmse: 5.52694
[159]	training's rmse: 5.0728	valid_1's rmse: 5.51204
[160]	training's rmse: 5.06882	valid_1's rmse: 5.49767
[161]	training's rmse: 5.06838	valid_1's rmse: 5.49682
[162]	training's rmse: 5.06614	valid_1's rmse: 5.49653
[163]	training's rmse: 5.06414	valid_1's rmse: 5.49634
[164]	traini

<lightgbm.basic.Booster at 0x1660363dcf8>