# Restaurant Visitor Forecasting by GooseLearning

In [1]:
import pandas as pd

In [2]:
# Table name -> CSV path. The key names (including 'air_hpd') are kept exactly
# as they are, because later cells look the frames up by these keys.
csv_paths = {
    'air_store': 'dataset/air_store_info.csv',
    'hpg_store': 'dataset/hpg_store_info.csv',
    'air_reserve': 'dataset/air_reserve.csv',
    'hpg_reserve': 'dataset/hpg_reserve.csv',
    'air_hpd': 'dataset/store_id_relation.csv',
    'date': 'dataset/date_info.csv',
    'train': 'dataset/air_visit_data.csv',
    'test': 'dataset/sample_submission.csv',
}

# Load every table into one dict of DataFrames.
data = {table: pd.read_csv(path) for table, path in csv_paths.items()}

# Rename the calendar column so it shares the 'visit_date' name used by train/test.
data['date'] = data['date'].rename(columns={'calendar_date': 'visit_date'})

### Обработка данных

Приводим `test` к формату `train`: выделяем `air_store_id` и `visit_date` из колонки `id`

In [3]:
# Each submission id is sliced into two parts: the first 20 characters become
# the store id, character 20 is skipped, and the remainder becomes the visit date.
submission_ids = data['test']['id']
data['test']['air_store_id'] = submission_ids.str[:20]
data['test']['visit_date'] = submission_ids.str[21:]

Дополнительные колонки даты в `train` и `test`

In [4]:
# Calendar parts derived from visit_date: (new column, datetime accessor attribute).
visit_parts = (
    ('visit_day', 'day'),
    ('visit_month', 'month'),
    ('visit_year', 'year'),
    ('visit_dow', 'dayofweek'),
)

for ds in ['train', 'test']:
    frame = data[ds]
    # Parse the date strings once, then read each calendar part off the accessor.
    frame['visit_date'] = pd.to_datetime(frame['visit_date'])
    visit_dt = frame['visit_date'].dt
    for column, attribute in visit_parts:
        frame[column] = getattr(visit_dt, attribute)

In [5]:
# Preview the first rows of train, including the visit_* columns derived from visit_date.
data['train'].head()

Unnamed: 0,air_store_id,visit_date,visitors,visit_day,visit_month,visit_year,visit_dow
0,air_ba937bf13d40fb24,2016-01-13,25,13,1,2016,2
1,air_ba937bf13d40fb24,2016-01-14,32,14,1,2016,3
2,air_ba937bf13d40fb24,2016-01-15,29,15,1,2016,4
3,air_ba937bf13d40fb24,2016-01-16,22,16,1,2016,5
4,air_ba937bf13d40fb24,2016-01-18,6,18,1,2016,0


Расширяем `air_reserve` бронированиями из `hpg_reserve` для тех ресторанов, у которых есть соответствие в `store_id_relation`

In [6]:
# Keep only the HPG reservations whose hpg_store_id appears in the relation
# table (inner join); the join brings in that table's remaining columns.
# NOTE(review): presumably the relation table maps hpg_store_id -> air_store_id — confirm.
data['hpg_reserve_air'] = pd.merge(data['hpg_reserve'], data['air_hpd'], how='inner', on=['hpg_store_id'])

# Stack the matched HPG reservations under the AIR ones. ignore_index=True gives
# the combined frame a fresh 0..n-1 index; without it the two inputs' row labels
# would repeat, making label-based lookups and index-aligned operations ambiguous.
data['air_reserve_ext'] = pd.concat(
    [data['air_reserve'], data['hpg_reserve_air'].drop(columns='hpg_store_id')],
    ignore_index=True,
)

In [7]:
# Row counts before and after extending the AIR reservations (label, table key).
for label, table in (('air_reserve    ', 'air_reserve'), ('air_reserve_ext', 'air_reserve_ext')):
    print(label, len(data[table]))

air_reserve     92378
air_reserve_ext 120561


Дополнительные колонки даты в `air_reserve_ext`

In [8]:
# Calendar parts derived from each timestamp: (column suffix, datetime accessor attribute).
reserve_parts = (
    ('_day', 'day'),
    ('_month', 'month'),
    ('_year', 'year'),
    ('_dow', 'dayofweek'),
)

reserve_ext = data['air_reserve_ext']
for dt_type in ['visit', 'reserve']:
    stamp_column = dt_type + '_datetime'
    # Parse the timestamp strings once, then read each calendar part off the accessor.
    reserve_ext[stamp_column] = pd.to_datetime(reserve_ext[stamp_column])
    stamp_dt = reserve_ext[stamp_column].dt
    for suffix, attribute in reserve_parts:
        reserve_ext[dt_type + suffix] = getattr(stamp_dt, attribute)
    
# Lead time of each reservation in whole days: visit time minus reservation time.
# The vectorized .dt.days yields the same values as reading .days row by row,
# without running a Python lambda for every row.
reservations = data['air_reserve_ext']
reservations['reserve_diff'] = (
    reservations['visit_datetime'] - reservations['reserve_datetime']
).dt.days

# True when the lead time in days does not exceed the visit's day-of-week number
# (pandas counts Monday as 0).
# NOTE(review): presumably this marks bookings made in the same week as the visit — confirm.
reservations['same_week'] = reservations['reserve_diff'] <= reservations['visit_dow']