In [6]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [7]:
complete_train_data = pd.read_csv('data/processed/train_proc_2.csv')
complete_test_data = pd.read_csv('data/processed/test_proc_2.csv')

In [8]:
complete_test_data.head()

Unnamed: 0,air_store_id,visit_date,reserve_visitors,calendar_date,day_of_week,holiday_flg,air_genre_name,air_area_name
0,air_00a91d42b08b08d9,2017-04-23,0.0,2017-04-23,Sunday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami
1,air_0164b9927d20bcc3,2017-04-23,0.0,2017-04-23,Sunday,0,Italian/French,Tōkyō-to Minato-ku Shibakōen
2,air_0241aa3964b7f861,2017-04-23,0.0,2017-04-23,Sunday,0,Izakaya,Tōkyō-to Taitō-ku Higashiueno
3,air_0328696196e46f18,2017-04-23,0.0,2017-04-23,Sunday,0,Dining bar,Ōsaka-fu Ōsaka-shi Nakanochō
4,air_034a3d5b40d5b1b1,2017-04-23,0.0,2017-04-23,Sunday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki


<h2>Feature Engineering</h2>
<p>1. Features from Date (Day of week, month, day of month etc)</p>
<h4><b>TODO</b></h4>
<p>1. Count of Reserved visitors for the nearby restaurant with same genre (TODO - need to add preprocessing for this step)</p>

In [9]:
def create_features(complete_df):
    complete_df['visit_date'] = pd.to_datetime(complete_df['visit_date'])
    complete_df['month'] = complete_df['visit_date'].dt.month
    complete_df['day'] = complete_df['visit_date'].dt.day

create_features(complete_train_data)
create_features(complete_test_data)

In [10]:
week_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 
            'Saturday': 5, 'Sunday': 6}
genre_encoder = LabelEncoder()
area_encoder = LabelEncoder()

def train_preprocessors(complete_df):
    global genre_encoder
    global area_encoder
    genre_encoder.fit(complete_df.air_genre_name)
    area_encoder.fit(complete_df.air_area_name)
    
def convert_feature_types(complete_df):
    complete_df.day_of_week = complete_df.day_of_week.map(week_map)
    global genre_encoder
    global area_encoder
    complete_df.air_genre_name = genre_encoder.transform(complete_df.air_genre_name)
    complete_df.air_area_name = area_encoder.transform(complete_df.air_area_name)

train_preprocessors(complete_train_data)
convert_feature_types(complete_train_data)
convert_feature_types(complete_test_data)

In [13]:
complete_train_data.head()

Unnamed: 0,air_store_id,visit_date,visitors,reserve_visitors,calendar_date,day_of_week,holiday_flg,air_genre_name,air_area_name,month,day
0,air_00a91d42b08b08d9,2016-07-01,35,1.0,2016-07-01,4,0,6,44,7,1
1,air_0241aa3964b7f861,2016-07-01,10,0.0,2016-07-01,4,0,7,82,7,1
2,air_034a3d5b40d5b1b1,2016-07-01,19,0.0,2016-07-01,4,0,2,102,7,1
3,air_036d4f1ee7285390,2016-07-01,37,0.0,2016-07-01,4,0,2,31,7,1
4,air_03963426c9312048,2016-07-01,55,0.0,2016-07-01,4,0,7,15,7,1


## Training

In [14]:
independent_cols = ['reserve_visitors', 'day_of_week', 'holiday_flg', 'month', 'day', 
                    'air_genre_name', 'air_area_name']
train_x = complete_train_data[independent_cols]
train_y = complete_train_data.visitors

rf_reg = RandomForestRegressor()
rf_reg.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

## Create Submission File

In [15]:
test_x = complete_test_data[independent_cols]
test_x['visitors'] = rf_reg.predict(test_x)

submission_file = pd.DataFrame()
submission_file['air_store_id'] = complete_test_data['air_store_id']
submission_file['visitors'] = test_x['visitors']
submission_file['date'] = pd.to_datetime(complete_test_data['visit_date']).dt.date
submission_file['id'] = submission_file['air_store_id'].map(str) + '_' + submission_file['date'].map(str)
submission_file = submission_file[['id', 'visitors']]
submission_file.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,14.133333
1,air_0164b9927d20bcc3_2017-04-23,31.25
2,air_0241aa3964b7f861_2017-04-23,21.4
3,air_0328696196e46f18_2017-04-23,15.4
4,air_034a3d5b40d5b1b1_2017-04-23,30.158333


In [16]:
submission_file.to_csv('data/processed/submission_2.csv', index=False)