# 数据探索

In [1]:
# Training set: daily historical sales records from January 2013 to October 2015.
import pandas as pd

# The `date` column is written day-first, so `dayfirst=True` is needed to parse it
# correctly. The deprecated `infer_datetime_format` flag is intentionally not passed:
# pandas 2.x warns about it and already infers one consistent format on its own.
train_data = pd.read_csv(
    "../input/competitive-data-science-predict-future-sales/sales_train.csv",
    parse_dates=['date'],
    dayfirst=True,
)

train_data

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.00,1.0
1,2013-01-03,0,25,2552,899.00,1.0
2,2013-01-05,0,25,2552,899.00,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.00,1.0
2935845,2015-10-09,33,25,7460,299.00,1.0
2935846,2015-10-14,33,25,7459,349.00,1.0
2935847,2015-10-22,33,25,7440,299.00,1.0


- date -- 日期格式为日/月/年

- date_block_num -- 连续的月份编号，便于使用。2013 年 1 月为 0，2013 年 2 月为 1，……，2015 年 10 月为 33

- shop_id -- 商店的唯一标识符

- item_id -- 产品的唯一标识符

- item_price -- 商品的当前价格

- item_cnt_day -- 售出的商品数量。您需要预测的是该指标的每月总量

In [2]:
# test.csv -- the test set: the shop/item pairs whose November 2015 sales must be predicted.
# The first CSV column becomes the row index.
test_data = pd.read_csv(
    filepath_or_buffer="../input/competitive-data-science-predict-future-sales/test.csv",
    index_col=0,
)

test_data

Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268
...,...,...
214195,45,18454
214196,45,16188
214197,45,15757
214198,45,19648


- ID -- 一个 Id，表示测试集中的（商店、商品）元组

- shop_id -- 商店的唯一标识符

- item_id -- 产品的唯一标识符

# 数据预处理

In [3]:
# Discard the two columns that the rest of the notebook does not use.
train_data = train_data.drop(columns=['date_block_num', 'item_price'])

train_data

Unnamed: 0,date,shop_id,item_id,item_cnt_day
0,2013-01-02,59,22154,1.0
1,2013-01-03,25,2552,1.0
2,2013-01-05,25,2552,-1.0
3,2013-01-06,25,2554,1.0
4,2013-01-15,25,2555,1.0
...,...,...,...,...
2935844,2015-10-10,25,7409,1.0
2935845,2015-10-09,25,7460,1.0
2935846,2015-10-14,25,7459,1.0
2935847,2015-10-22,25,7440,1.0


In [4]:
# Collapse each date to its monthly period, then discard the original date column.
train_data = (
    train_data
    .assign(month=train_data['date'].dt.to_period('M'))
    .drop(columns=['date'])
)

train_data

Unnamed: 0,shop_id,item_id,item_cnt_day,month
0,59,22154,1.0,2013-01
1,25,2552,1.0,2013-01
2,25,2552,-1.0,2013-01
3,25,2554,1.0,2013-01
4,25,2555,1.0,2013-01
...,...,...,...,...
2935844,25,7409,1.0,2015-10
2935845,25,7460,1.0,2015-10
2935846,25,7459,1.0,2015-10
2935847,25,7440,1.0,2015-10


In [5]:
# Total the quantity sold for every (month, item, shop) combination.
train_data = (
    train_data
    .groupby(by=['month', 'item_id', 'shop_id'])
    .agg('sum')
    .reset_index()
)

train_data

Unnamed: 0,month,item_id,shop_id,item_cnt_day
0,2013-01,19,25,1.0
1,2013-01,27,1,1.0
2,2013-01,27,2,1.0
3,2013-01,27,10,1.0
4,2013-01,27,19,1.0
...,...,...,...,...
1609119,2015-10,22167,28,1.0
1609120,2015-10,22167,37,1.0
1609121,2015-10,22167,42,4.0
1609122,2015-10,22167,47,1.0


In [6]:
# Reshape to wide format: one row per (item, shop) pair and one column per month.
# Combinations with no recorded sales in a month are filled with 0.
train_data = train_data.pivot_table(
    values='item_cnt_day',
    index=['item_id', 'shop_id'],
    columns='month',
    fill_value=0,
).reset_index()

train_data

month,item_id,shop_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,22168,12,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424120,22168,16,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424121,22168,42,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424122,22168,43,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Attach the monthly training history to every (shop, item) pair of the test set.
# The left join keeps all test pairs; pairs with no history get 0 for every month,
# and the two key columns are removed once the join is done.
all_data = (
    pd.merge(test_data, train_data, on=['item_id', 'shop_id'], how='left')
    .fillna(0)
    .drop(columns=['item_id', 'shop_id'])
)

# Convert every feature column label to str
all_data.columns = all_data.columns.astype(str)

all_data

Unnamed: 0,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
214196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Training features: every month from 2013-01 through 2015-09.
# The model learns to predict the 2015-10 sales from them.
x_train = all_data.drop(columns=['2015-10'])

x_train

Unnamed: 0,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,...,2014-12,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
214196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Training target: the 2015-10 column, kept as a one-column DataFrame.
y_train = all_data.loc[:, ['2015-10']]

y_train

Unnamed: 0,2015-10
0,0.0
1,0.0
2,1.0
3,0.0
4,0.0
...,...
214195,1.0
214196,0.0
214197,0.0
214198,0.0


In [10]:
# Prediction features: the window shifted forward by one month (2013-02 through 2015-10),
# so the value predicted from it corresponds to 2015-11.
x_test = all_data.drop(columns=['2013-01'])

x_test

Unnamed: 0,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,2013-11,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
214196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 模型预测

In [11]:
# XGBRegressor model
from xgboost import XGBRegressor

other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

xgb = XGBRegressor(**other_params)

# x_train and x_test cover different calendar windows, so their column labels differ
# ('2013-01'..'2015-09' vs '2013-02'..'2015-10') although they have the same number of
# columns and column i plays the same positional role in both. Passing bare NumPy
# arrays makes the model match features by position, so prediction cannot fail on
# feature-name validation.
xgb.fit(x_train.to_numpy(), y_train.to_numpy().ravel())

# Clip the predictions to the [0, 20] range.
prediction = xgb.predict(x_test.to_numpy()).clip(0., 20.)

In [12]:
# Disabled alternative model: a LightGBM regressor fitted on the same x_train / y_train.
# Uncommenting this cell would overwrite `prediction` with the LightGBM output.
# from lightgbm import LGBMRegressor

# model_lgb = LGBMRegressor( n_estimators=200,
#                            learning_rate=0.03,
#                            num_leaves=32,
#                            colsample_bytree=0.9497036,
#                            subsample=0.8715623,
#                            max_depth=8,
#                            reg_alpha=0.04,
#                            reg_lambda=0.073,
#                            min_split_gain=0.0222415,
#                            min_child_weight=40)
# model_lgb.fit(x_train, y_train)

# prediction = model_lgb.predict(x_test).clip(0., 20.)

# 提交结果

In [13]:
# Load the sample submission as a template. The relative "../input" prefix is the same
# one used for the train and test CSVs, so all three inputs resolve from one location.
submission = pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")

# The predictions are assigned by position.
# NOTE(review): this assumes the template rows are in the same order as test.csv — confirm.
submission['item_cnt_month'] = prediction

submission

Unnamed: 0,ID,item_cnt_month
0,0,0.485445
1,1,0.123978
2,2,0.798179
3,3,0.159123
4,4,0.123978
...,...,...
214195,214195,0.280784
214196,214196,0.123978
214197,214197,0.107044
214198,214198,0.123978


In [14]:
# Write the submission file without the DataFrame's row index.
submission.to_csv(path_or_buf='submission.csv', index=False)