### ライブラリ・データの読み込み

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [19]:
train_df = pd.read_csv('./sales_train_v2.csv')
shop_df = pd.read_csv('./shops.csv')
item_df = pd.read_csv('./items.csv')
test_df = pd.read_csv('./test.csv')

In [3]:
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
shop_df.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [5]:
item_df.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [6]:
test_df.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


### 前処理

In [8]:
item_category_id_df = pd.get_dummies(item_df['item_category_id'], prefix='item_category_id')
item_category_id_df = pd.concat([item_df['item_id'], item_category_id_df], axis=1)

In [9]:
monthly_sales = train_df.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].count().unstack().reset_index()
monthly_sales.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,,9.0,,,,,,,...,,,,,,,,,,
1,0,31,,7.0,,,,,,,...,,,,,,,,,,
2,0,32,4.0,7.0,,,,,,,...,,,,,,,,,,
3,0,33,3.0,3.0,,,,,,,...,,,,,,,,,,
4,0,35,1.0,11.0,,,,,,,...,,,,,,,,,,


In [10]:
monthly_sales = pd.merge(monthly_sales, item_category_id_df, on='item_id')

In [11]:
monthly_sales = monthly_sales.fillna(0)

### モデル構築

In [13]:
X_train = monthly_sales.drop(['shop_id', 'item_id', 33], axis=1)
Y_train = monthly_sales[33]

In [14]:
lr = Lasso()
lr.fit(X_train, Y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
Y_pred = lr.predict(X_train)
np.sqrt(mean_squared_error(y_true=Y_train, y_pred=Y_pred))

0.655036090425938

### 予測

In [20]:
test_df = pd.merge(test_df, monthly_sales, how='left', on=['shop_id', 'item_id'])

In [22]:
test_df = test_df.fillna(0)

In [23]:
X_test = test_df.drop(['ID', 'shop_id', 'item_id', 33], axis=1)

In [24]:
Y_pred = lr.predict(X_test)

In [25]:
submit_df = pd.DataFrame({'ID':test_df['ID'], 'item_cnt_month': Y_pred})

In [26]:
submit_df

Unnamed: 0,ID,item_cnt_month
0,0,0.126175
1,1,0.126175
2,2,0.126175
3,3,0.126175
4,4,0.126175
5,5,0.126175
6,6,0.126175
7,7,0.126175
8,8,0.126175
9,9,0.126175


In [27]:
submit_df.to_csv('./result.csv', index=False)