# Restaurant Visitor Forecasting by GooseLearning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the input CSV files; change it here if the data lives elsewhere.
DATA_DIR = 'dataset'


def read_dataset(name):
    """Read ``<DATA_DIR>/<name>.csv`` and return it as a DataFrame."""
    return pd.read_csv('{}/{}.csv'.format(DATA_DIR, name))


# One DataFrame per input file, named after the file it was read from.
air_reserve = read_dataset('air_reserve')
hpg_reserve = read_dataset('hpg_reserve')
air_store_info = read_dataset('air_store_info')
hpg_store_info = read_dataset('hpg_store_info')
store_id_relation = read_dataset('store_id_relation')
air_visit_data = read_dataset('air_visit_data')
sample_submission = read_dataset('sample_submission')
date_info = read_dataset('date_info')

### Обработка `sample_submission`

In [3]:
# Bring sample_submission into the same layout as air_visit_data:
# split the combined `id` ("<air_store_id>_<visit_date>") into two columns.
#
# The guard makes the cell safe to re-run: after the first run the `id`
# column is gone, so a second run would otherwise fail with KeyError.
if 'id' in sample_submission.columns:
    # Split on the LAST underscore only: the store id itself contains an
    # underscore ("air_..."), while the date part contains none.
    id_parts = sample_submission['id'].str.rsplit('_', n=1, expand=True)
    sample_submission = sample_submission.assign(
        air_store_id=id_parts[0],
        visit_date=id_parts[1],
    )[['air_store_id', 'visit_date', 'visitors']]

In [4]:
# Preview the reshaped frame: it should now have air_store_id / visit_date / visitors columns.
sample_submission.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_00a91d42b08b08d9,2017-04-23,0
1,air_00a91d42b08b08d9,2017-04-24,0
2,air_00a91d42b08b08d9,2017-04-25,0
3,air_00a91d42b08b08d9,2017-04-26,0
4,air_00a91d42b08b08d9,2017-04-27,0


In [5]:
# Number of distinct stores that appear in the submission template.
sample_submission['air_store_id'].nunique()

821

In [6]:
# Number of distinct dates that appear in the submission template.
sample_submission['visit_date'].nunique()

39

In [7]:
# The template should be a full grid: one row for every (store, date) pair.
# The expected size is derived from the data instead of being hard-coded,
# so the check stays valid if the input file changes.
len(sample_submission.index) == (
    sample_submission['air_store_id'].nunique()
    * sample_submission['visit_date'].nunique()
)

True

In [8]:
# Number of distinct stores in air_store_info, for comparison with the
# store count of sample_submission.
air_store_info['air_store_id'].nunique()

829

In [9]:
# Number of distinct stores in air_visit_data, for comparison with the
# store count of air_store_info.
air_visit_data['air_store_id'].nunique()

829

### Объединение датасетов

In [10]:
# Collect the restaurant identifiers known to each system
air_store_ids = air_store_info[['air_store_id']]
hpg_store_ids = hpg_store_info[['hpg_store_id']]

# Build the id mapping in both directions.
# AIR -> HPG: every AIR store, with its HPG id where store_id_relation has one (NaN otherwise)
goose_relation_air_hpg = (
    air_store_ids
    .merge(store_id_relation, on='air_store_id', how='left')
    [['air_store_id', 'hpg_store_id']]
)

# HPG -> AIR: every HPG store, with its AIR id where store_id_relation has one (NaN otherwise)
goose_relation_hpg_air = (
    hpg_store_ids
    .merge(store_id_relation, on='hpg_store_id', how='left')
    [['air_store_id', 'hpg_store_id']]
)

# Union of both mappings; pairs present on both sides are kept once
goose_relation = pd.merge(goose_relation_air_hpg, goose_relation_hpg_air, how='outer')

In [11]:
# Sanity-check the sizes of the mapping tables built above:
# raw relation rows, matched rows in each direction, size of the
# combined table, and the total number of store ids across both systems.
relation_sizes = [
    ('relations     ', len(store_id_relation)),
    ('air_to_hpg    ', len(goose_relation_air_hpg.dropna())),
    ('hpg_to_air    ', len(goose_relation_hpg_air.dropna())),
    ('goose_relation', len(goose_relation)),
    ('all_ids       ', len(air_store_ids) + len(hpg_store_ids)),
]
for label, size in relation_sizes:
    print(label, size)

relations      150
air_to_hpg     150
hpg_to_air     63
goose_relation 5456
all_ids        5519


In [12]:
# Preview the columns available in air_store_info.
air_store_info.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599


In [13]:
# Store tables keyed by their id (kept as standalone names for lookups elsewhere).
air_store_info_indexed = air_store_info.set_index('air_store_id')
hpg_store_info_indexed = hpg_store_info.set_index('hpg_store_id')

# Attribute columns to attach from each system. latitude/longitude exist in
# both store tables, so they are prefixed to keep them apart after merging.
air_store_attrs = air_store_info.rename(
    columns={'latitude': 'air_latitude', 'longitude': 'air_longitude'}
)[['air_store_id', 'air_genre_name', 'air_area_name', 'air_latitude', 'air_longitude']]

hpg_store_attrs = hpg_store_info.rename(
    columns={'latitude': 'hpg_latitude', 'longitude': 'hpg_longitude'}
)[['hpg_store_id', 'hpg_genre_name', 'hpg_area_name', 'hpg_latitude', 'hpg_longitude']]

# Attach the attributes with two vectorized left merges instead of a
# row-by-row loop. Left merges keep every row of goose_relation in its
# original order; rows without a matching store get NaN in the new columns.
# validate='many_to_one' raises if a store id is duplicated in an info table,
# which would otherwise silently multiply rows.
goose_store_info = (
    goose_relation
    .merge(air_store_attrs, on='air_store_id', how='left', validate='many_to_one')
    .merge(hpg_store_attrs, on='hpg_store_id', how='left', validate='many_to_one')
)
        

In [14]:
# Show only fully populated rows, i.e. stores that have both an AIR and an
# HPG id and whose attributes were found in both store tables.
goose_store_info.dropna().head()

Unnamed: 0,air_store_id,hpg_store_id,air_genre_name,air_area_name,air_latitude,air_longitude,hpg_genre_name,hpg_area_name,hpg_latitude,hpg_longitude
22,air_638c35eb25e53eea,hpg_f07ec7b288165b27,Italian/French,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,Spain Bar/Italian Bar,Fukuoka-ken Fukuoka-shi Daimyō,33.586969,130.392801
32,air_fcfbdcf7b1f82c6e,hpg_0b53e00789c2eafe,Italian/French,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Italian,Tōkyō-to Shibuya-ku None,35.659214,139.699736
33,air_f8233ad00755c35c,hpg_098e4dd30e54fee6,Italian/French,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Spain Bar/Italian Bar,Tōkyō-to Shibuya-ku None,35.659214,139.699736
53,air_082908692355165e,hpg_b8715d0ac52b1763,Italian/French,Tōkyō-to Shinjuku-ku Kabukichō,35.69384,139.703549,Spain Bar/Italian Bar,Tōkyō-to Shinjuku-ku None,35.691384,139.701256
63,air_48f4da6223571da4,hpg_832ba309e6699258,Italian/French,Tōkyō-to Tachikawa-shi Izumichō,35.714014,139.407843,Italian,Tōkyō-to Hachiōji-shi Ishikawamachi,35.677207,139.37387
