In [1]:
# S3 prefix
s3_bucket = 'recruit-restaurant-visitor-forecasting-on-sagemaker'
prefix = 'xgboost-example'

import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

In [2]:
WORK_DIRECTORY = 'input'

train_input = sagemaker_session.upload_data(
    path='{}'.format(WORK_DIRECTORY), 
    bucket=s3_bucket,
    key_prefix='{}/{}'.format(prefix, 'train'))

In [3]:
import pandas as pd

air_visit = pd.read_csv('input_small/air_visit_data.csv')
air_visit.index = pd.to_datetime(air_visit['visit_date'])
air_visit = air_visit.groupby('air_store_id').apply(lambda g: g['visitors'].resample('1d').sum()).reset_index()
air_visit['visit_date'] = air_visit['visit_date'].dt.strftime('%Y-%m-%d')
air_visit['was_nil'] = air_visit['visitors'].isnull()
air_visit['visitors'].fillna(0, inplace=True)

air_visit.head()

Unnamed: 0,air_store_id,visit_date,visitors,was_nil
0,air_25e9888d30b386df,2016-01-05,12,False
1,air_25e9888d30b386df,2016-01-06,28,False
2,air_25e9888d30b386df,2016-01-07,6,False
3,air_25e9888d30b386df,2016-01-08,8,False
4,air_25e9888d30b386df,2016-01-09,13,False


In [4]:
date_info = pd.read_csv('input/date_info.csv')
date_info.rename(columns={'holiday_flg': 'is_holiday', 'calendar_date': 'visit_date'}, inplace=True)
date_info['prev_day_is_holiday'] = date_info['is_holiday'].shift().fillna(0)
date_info['next_day_is_holiday'] = date_info['is_holiday'].shift(-1).fillna(0)

date_info.head()

Unnamed: 0,visit_date,day_of_week,is_holiday,prev_day_is_holiday,next_day_is_holiday
0,2016-01-01,Friday,1,0.0,1.0
1,2016-01-02,Saturday,1,1.0,1.0
2,2016-01-03,Sunday,1,1.0,0.0
3,2016-01-04,Monday,0,1.0,0.0
4,2016-01-05,Tuesday,0,0.0,0.0


In [5]:
air_store_info = pd.read_csv('input/rrv-weather-data/air_store_info_with_nearest_active_station.csv')

air_store_info.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude,latitude_str,longitude_str,station_id,station_latitude,station_longitude,station_vincenty,station_great_circle
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,"""34.6951242""","""135.1978525""",hyogo__kobe-kana__koube,34.696667,135.211667,1.277232,1.274882
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,"""34.6951242""","""135.1978525""",hyogo__kobe-kana__koube,34.696667,135.211667,1.277232,1.274882
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,"""34.6951242""","""135.1978525""",hyogo__kobe-kana__koube,34.696667,135.211667,1.277232,1.274882
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,"""34.6951242""","""135.1978525""",hyogo__kobe-kana__koube,34.696667,135.211667,1.277232,1.274882
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,"""35.6580681""","""139.7515992""",tokyo__tokyo-kana__tonokyo,35.691667,139.75,3.730672,3.739835


In [6]:
import numpy as np

submission = pd.read_csv('input/sample_submission.csv')
submission['air_store_id'] = submission['id'].str.slice(0, 20)
submission['visit_date'] = submission['id'].str.slice(21)
submission['is_test'] = True
submission['visitors'] = np.nan
submission['test_number'] = range(len(submission))

submission.head()

Unnamed: 0,id,visitors,air_store_id,visit_date,is_test,test_number
0,air_00a91d42b08b08d9_2017-04-23,,air_00a91d42b08b08d9,2017-04-23,True,0
1,air_00a91d42b08b08d9_2017-04-24,,air_00a91d42b08b08d9,2017-04-24,True,1
2,air_00a91d42b08b08d9_2017-04-25,,air_00a91d42b08b08d9,2017-04-25,True,2
3,air_00a91d42b08b08d9_2017-04-26,,air_00a91d42b08b08d9,2017-04-26,True,3
4,air_00a91d42b08b08d9_2017-04-27,,air_00a91d42b08b08d9,2017-04-27,True,4


In [7]:
data = pd.concat((air_visit, submission.drop('id', axis='columns')))
data['is_test'].fillna(False, inplace=True)
data = pd.merge(left=data, right=date_info, on='visit_date', how='left', sort=True)
data = pd.merge(left=data, right=air_store_info, on='air_store_id', how='left', sort=True)
data['visitors'] = data['visitors'].astype(float)

data.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


Unnamed: 0,air_store_id,is_test,test_number,visit_date,visitors,was_nil,day_of_week,is_holiday,prev_day_is_holiday,next_day_is_holiday,...,air_area_name,latitude,longitude,latitude_str,longitude_str,station_id,station_latitude,station_longitude,station_vincenty,station_great_circle
0,air_00a91d42b08b08d9,True,0.0,2017-04-23,,,Sunday,0,0.0,0.0,...,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,"""35.6940027""","""139.7535951""",tokyo__tokyo-kana__tonokyo,35.691667,139.75,0.416011,0.415906
1,air_00a91d42b08b08d9,True,1.0,2017-04-24,,,Monday,0,0.0,0.0,...,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,"""35.6940027""","""139.7535951""",tokyo__tokyo-kana__tonokyo,35.691667,139.75,0.416011,0.415906
2,air_00a91d42b08b08d9,True,2.0,2017-04-25,,,Tuesday,0,0.0,0.0,...,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,"""35.6940027""","""139.7535951""",tokyo__tokyo-kana__tonokyo,35.691667,139.75,0.416011,0.415906
3,air_00a91d42b08b08d9,True,3.0,2017-04-26,,,Wednesday,0,0.0,0.0,...,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,"""35.6940027""","""139.7535951""",tokyo__tokyo-kana__tonokyo,35.691667,139.75,0.416011,0.415906
4,air_00a91d42b08b08d9,True,4.0,2017-04-27,,,Thursday,0,0.0,0.0,...,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,"""35.6940027""","""139.7535951""",tokyo__tokyo-kana__tonokyo,35.691667,139.75,0.416011,0.415906


In [8]:
import glob

weather_dfs = []

for path in glob.glob('input/rrv-weather-data/1-1-16_5-31-17_Weather/*.csv'):
    weather_df = pd.read_csv(path)
    weather_df['station_id'] = path.split('\\')[-1].rstrip('.csv')
    weather_dfs.append(weather_df)

weather = pd.concat(weather_dfs, axis='rows')
weather.rename(columns={'calendar_date': 'visit_date'}, inplace=True)

means = weather.groupby('visit_date')[['avg_temperature', 'precipitation']].mean().reset_index()
means.rename(columns={'avg_temperature': 'global_avg_temperature', 'precipitation': 'global_precipitation'}, inplace=True)
weather = pd.merge(left=weather, right=means, on='visit_date', how='left')
weather['avg_temperature'].fillna(weather['global_avg_temperature'], inplace=True)
weather['precipitation'].fillna(weather['global_precipitation'], inplace=True)

weather[['visit_date', 'avg_temperature', 'precipitation']].head()

Unnamed: 0,visit_date,avg_temperature,precipitation
0,2016-01-01,8.6,0.0
1,2016-01-02,10.3,0.0
2,2016-01-03,12.1,0.0
3,2016-01-04,13.1,0.0
4,2016-01-05,10.7,0.0


In [9]:
data['visit_date'] = pd.to_datetime(data['visit_date'])
data.sort_values(['visit_date', 'air_store_id'], inplace=True)
data.index = data['visit_date']

In [10]:
data.to_csv('input/data.csv', index=False, header=True)

## Preprocessing

In [11]:
data2 = pd.read_csv('input/data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
data

Unnamed: 0,visit_date,air_store_id,is_test,test_number,visit_date.1,visitors,was_nil,day_of_week,is_holiday,prev_day_is_holiday,...,air_area_name,latitude,longitude,latitude_str,longitude_str,station_id,station_latitude,station_longitude,station_vincenty,station_great_circle
0,2016-01-04,air_9438d67241c81314,False,,2016-01-04,34.0,False,Monday,0,1.0,...,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,"""33.5892157""","""130.3928134""",fukuoka__fukuoka-kana__fukuoka,33.581667,130.375000,1.853535,1.851854
1,2016-01-04,air_ee3a01f0c71a769f,False,,2016-01-04,61.0,False,Monday,0,1.0,...,Shizuoka-ken Hamamatsu-shi Motoshirochō,34.710895,137.725940,"""34.7108955""","""137.7259397""",shizuoka__hamamatsu-kana__hamamatsu,34.753333,137.711667,4.885979,4.897191
2,2016-01-04,air_fd6aac1043520e83,False,,2016-01-04,28.0,False,Monday,0,1.0,...,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,"""35.6580681""","""139.7515992""",tokyo__tokyo-kana__tonokyo,35.691667,139.750000,3.730672,3.739835
3,2016-01-05,air_25e9888d30b386df,False,,2016-01-05,12.0,False,Tuesday,0,0.0,...,Tōkyō-to Shinagawa-ku Higashigotanda,35.626568,139.725858,"""35.6265683""","""139.7258581""",tokyo__tokyo-kana__tonokyo,35.691667,139.750000,7.546405,7.562205
4,2016-01-05,air_64d4491ad8cdb1c6,False,,2016-01-05,9.0,False,Tuesday,0,0.0,...,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,"""35.6580681""","""139.7515992""",tokyo__tokyo-kana__tonokyo,35.691667,139.750000,3.730672,3.739835
5,2016-01-05,air_9438d67241c81314,False,,2016-01-05,25.0,False,Tuesday,0,0.0,...,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,"""33.5892157""","""130.3928134""",fukuoka__fukuoka-kana__fukuoka,33.581667,130.375000,1.853535,1.851854
6,2016-01-05,air_ee3a01f0c71a769f,False,,2016-01-05,25.0,False,Tuesday,0,0.0,...,Shizuoka-ken Hamamatsu-shi Motoshirochō,34.710895,137.725940,"""34.7108955""","""137.7259397""",shizuoka__hamamatsu-kana__hamamatsu,34.753333,137.711667,4.885979,4.897191
7,2016-01-05,air_fd6aac1043520e83,False,,2016-01-05,36.0,False,Tuesday,0,0.0,...,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,"""35.6580681""","""139.7515992""",tokyo__tokyo-kana__tonokyo,35.691667,139.750000,3.730672,3.739835
8,2016-01-06,air_25e9888d30b386df,False,,2016-01-06,28.0,False,Wednesday,0,0.0,...,Tōkyō-to Shinagawa-ku Higashigotanda,35.626568,139.725858,"""35.6265683""","""139.7258581""",tokyo__tokyo-kana__tonokyo,35.691667,139.750000,7.546405,7.562205
9,2016-01-06,air_64d4491ad8cdb1c6,False,,2016-01-06,15.0,False,Wednesday,0,0.0,...,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,"""35.6580681""","""139.7515992""",tokyo__tokyo-kana__tonokyo,35.691667,139.750000,3.730672,3.739835
