# レストラン収益予測

データからレストランの年間売上高を予測する  
提出CSVにはidと予測値を記述

## データセット内容

Id        : レストランごとの振り分けID
Open Date : レストランのオープン日  
City      : レストランがある市  
City Group: 市町村のタイプ  大都市、その他  
Type      : レストランの種類(FC: フードコード IL: インライン DT: ドライブスルー MB: モバイル)  
P1,P2-P37 : 座標データ  
revenue   : 収益  

## 実装

### import

In [59]:
# pandas
import pandas as pd
from pandas import Series, DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

### Dataset読み込み

In [60]:
train_df = pd.read_csv('datasets/train.csv')
test_df  = pd.read_csv('datasets/test.csv')

# 内容チェック
train_df.info()
print('---------------------------------------')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
Id            137 non-null int64
Open Date     137 non-null object
City          137 non-null object
City Group    137 non-null object
Type          137 non-null object
P1            137 non-null int64
P2            137 non-null float64
P3            137 non-null float64
P4            137 non-null float64
P5            137 non-null int64
P6            137 non-null int64
P7            137 non-null int64
P8            137 non-null int64
P9            137 non-null int64
P10           137 non-null int64
P11           137 non-null int64
P12           137 non-null int64
P13           137 non-null float64
P14           137 non-null int64
P15           137 non-null int64
P16           137 non-null int64
P17           137 non-null int64
P18           137 non-null int64
P19           137 non-null int64
P20           137 non-null int64
P21           137 non-null int64
P22           137 non-nul

### 各カラムの詳細な調査

In [61]:
# Typeカラム
# unique(): ユニーク(一意)な値のリストをndarrayで返す
# MBは存在しない模様
train_df['Type'].unique()

array(['IL', 'FC', 'DT'], dtype=object)

In [62]:
# City Groupカラム
# 大都市、その他のみ
train_df['City Group'].unique()

array(['Big Cities', 'Other'], dtype=object)

In [63]:
# Cityカラム
train_df['City'].unique()

array(['İstanbul', 'Ankara', 'Diyarbakır', 'Tokat', 'Gaziantep',
       'Afyonkarahisar', 'Edirne', 'Kocaeli', 'Bursa', 'İzmir', 'Sakarya',
       'Elazığ', 'Kayseri', 'Eskişehir', 'Şanlıurfa', 'Samsun', 'Adana',
       'Antalya', 'Kastamonu', 'Uşak', 'Muğla', 'Kırklareli', 'Konya',
       'Karabük', 'Tekirdağ', 'Denizli', 'Balıkesir', 'Aydın', 'Amasya',
       'Kütahya', 'Bolu', 'Trabzon', 'Isparta', 'Osmaniye'], dtype=object)

### Pre-Processing

各特徴が文字列のままだと学習できないのでこれらを0,1のフラグで分類できるようにする

In [64]:
# Type
# 持っている特徴の'IL','FC','DT'をそれぞれ個別のカラムにする
# np.where: 第一引数が成立するなら1をそうでなければ０を返す
# つまりILならILが存在するレコードなら1が返される
train_df['Type_IL'] = np.where(train_df['Type'] == 'IL', 1, 0)
train_df['Type_FC'] = np.where(train_df['Type'] == 'FC', 1, 0)
train_df['Type_DT'] = np.where(train_df['Type'] == 'DT', 1, 0)

test_df['Type_IL'] = np.where(test_df['Type'] == 'IL', 1, 0)
test_df['Type_FC'] = np.where(test_df['Type'] == 'FC', 1, 0)
test_df['Type_DT'] = np.where(test_df['Type'] == 'DT', 1, 0)

# City Group
train_df['Big_Cities'] = np.where(train_df['City Group'] == 'Big Cities', 1, 0)
train_df['Days_Open'] = (pd.to_datetime('2015-03-23') - pd.to_datetime(train_df['Open Date'])).dt.days

test_df['Big_Cities'] = np.where(test_df['City Group'] == 'Big Cities', 1, 0)
test_df['Days_Open'] = (pd.to_datetime('2015-03-23') - pd.to_datetime(test_df['Open Date'])).dt.days

# 不要なカラムはDrop
train_df = train_df.drop('Type', axis=1)
train_df = train_df.drop('City Group', axis=1)
train_df = train_df.drop('City', axis=1)
train_df = train_df.drop('Open Date', axis=1)

test_df = test_df.drop('Type', axis=1)
test_df = test_df.drop('City Group', axis=1)
test_df = test_df.drop('City', axis=1)
test_df = test_df.drop('Open Date', axis=1)

# objectがなくなったか確認
print(train_df.dtypes)

Id              int64
P1              int64
P2            float64
P3            float64
P4            float64
P5              int64
P6              int64
P7              int64
P8              int64
P9              int64
P10             int64
P11             int64
P12             int64
P13           float64
P14             int64
P15             int64
P16             int64
P17             int64
P18             int64
P19             int64
P20             int64
P21             int64
P22             int64
P23             int64
P24             int64
P25             int64
P26           float64
P27           float64
P28           float64
P29           float64
P30             int64
P31             int64
P32             int64
P33             int64
P34             int64
P35             int64
P36             int64
P37             int64
revenue       float64
Type_IL         int64
Type_FC         int64
Type_DT         int64
Big_Cities      int64
Days_Open       int64
dtype: object


### X,Yの分断

In [65]:
from sklearn import model_selection
from sklearn import linear_model

X = train_df.drop(['Id', 'revenue'], axis=1)
Y = train_df.revenue
print(Y)

0       5653753.0
1       6923131.0
2       2055379.0
3       2675511.0
4       4316715.0
5       5017319.0
6       5166635.0
7       4491607.0
8       4952497.0
9       5444227.0
10      3745135.0
11      5161370.0
12      1734634.0
13      4807746.0
14      1999097.0
15      3218918.0
16     19696939.0
17      8213524.0
18      5337526.0
19      2021934.0
20      5525735.0
21      1149870.0
22      3956086.0
23      2999068.0
24      8904084.0
25      3778621.0
26      2267425.0
27      5435276.0
28      4705945.0
29      3447890.0
          ...    
107     3248660.0
108     3570392.0
109     4219263.0
110     2954086.0
111     2993069.0
112     3784230.0
113     2097022.0
114     4155435.0
115     4882985.0
116     8894598.0
117     2018785.0
118     1847826.0
119     3780019.0
120     4067566.0
121     3445076.0
122     4286645.0
123     4263629.0
124     3810007.0
125     4780607.0
126     4015749.0
127     7592272.0
128     2383840.0
129     3939804.0
130     3376145.0
131     31

## Model生成

### ランダムフォレスト

In [75]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=150)
model.fit(X, Y)
model.score(X,Y)

test_predicted = pd.DataFrame()
test_predicted['Id'] = test_df.Id
test_predicted['Prediction'] = model.predict(test_df.drop('Id', axis=1))
test_predicted.to_csv('submissioncsv/submission-logreg.csv', index=False)


0.86839789616260776