# Restaurant Visitor Forecasting by GooseLearning

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, metrics, preprocessing

pd.options.display.max_columns = 1000

lbl = preprocessing.LabelEncoder()

In [2]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** .5

In [3]:
data = {
    'air_stores': pd.read_csv('dataset/air_store_info.csv'),
    'hpg_stores': pd.read_csv('dataset/hpg_store_info.csv'),

    'air_reserve': pd.read_csv('dataset/air_reserve.csv'),
    'hpg_reserve': pd.read_csv('dataset/hpg_reserve.csv'),

    'air_hpd': pd.read_csv('dataset/store_id_relation.csv'),
    
    'dates': pd.read_csv('dataset/date_info.csv').rename(columns={
        'calendar_date': 'visit_date',
        'holiday_flg'  : 'holiday',
    })[['visit_date', 'holiday']],

    'train': pd.read_csv('dataset/air_visit_data.csv'),
    'test':  pd.read_csv('dataset/sample_submission.csv'),
}

## Обработка данных

### Обрабатываем `dates`

In [4]:
data['dates']['visit_date']    = pd.to_datetime(data['dates']['visit_date'])
data['dates']['visit_day']     = data['dates']['visit_date'].dt.day
data['dates']['visit_month']   = data['dates']['visit_date'].dt.month
data['dates']['visit_year']    = data['dates']['visit_date'].dt.year
data['dates']['visit_dow']     = data['dates']['visit_date'].dt.dayofweek
data['dates']['visit_work']    = lbl.fit_transform(data['dates']['visit_dow'] < 5)
data['dates']['visit_weekend'] = lbl.fit_transform(data['dates']['visit_work'] == False)
data['dates']['visit_int']     = data['dates']['visit_date'].dt.strftime('%Y%m%d').astype(int)
data['dates']['visit_date']    = data['dates']['visit_date'].dt.strftime('%Y-%m-%d')

# TODO: учитывать перенос рабочих дней во время праздников
# TODO: удалить праздничную неделю (из условия)

In [5]:
print('dates', data['dates'].shape)

data['dates'].head()

dates (517, 9)


Unnamed: 0,visit_date,holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_int
0,2016-01-01,1,1,1,2016,4,1,0,20160101
1,2016-01-02,1,2,1,2016,5,0,1,20160102
2,2016-01-03,1,3,1,2016,6,0,1,20160103
3,2016-01-04,0,4,1,2016,0,1,0,20160104
4,2016-01-05,0,5,1,2016,1,1,0,20160105


### Обрабатываем `train` и `test`

Приводим к единому формату

In [6]:
data['train']['id'] = data['train']['air_store_id'] + '_' + data['train']['visit_date']

data['test']['air_store_id'] = [id[:20] for id in data['test']['id']]
data['test']['visit_date']   = [id[21:] for id in data['test']['id']]

Дополнительные колонки даты

In [7]:
data['train'] = pd.merge(data['train'], data['dates'], how='left', on=['visit_date'])
data['test']  = pd.merge(data['test'],  data['dates'], how='left', on=['visit_date'])

In [8]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (252108, 12)
test  (32019, 12)


Unnamed: 0,air_store_id,visit_date,visitors,id,holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_int
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,1,0,20160113
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,1,0,20160114
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,1,0,20160115
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,0,1,20160116
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,1,0,20160118


### Формируем `reserve`

 Расширяем `air_reserve` данными из `hpg_reserve`

In [9]:
hpg_reserve_air = pd.merge(data['hpg_reserve'], data['air_hpd'], how='inner', on=['hpg_store_id'])
data['reserve'] = pd.concat([data['air_reserve'], hpg_reserve_air.drop('hpg_store_id', axis=1)])

In [10]:
print('air_reserve', data['air_reserve'].shape)
print('reserve    ', data['reserve'].shape)

air_reserve (92378, 4)
reserve     (120561, 4)


Дополнительные колонки даты в `reserve`

In [11]:
data['reserve']['visit_datetime']   = pd.to_datetime(data['reserve']['visit_datetime'])
data['reserve']['reserve_datetime'] = pd.to_datetime(data['reserve']['reserve_datetime'])

data['reserve']['visit_date'] = data['reserve']['visit_datetime'].dt.strftime('%Y-%m-%d')
    
data['reserve']['reserve_diff'] = data['reserve']['visit_datetime'] - data['reserve']['reserve_datetime']
data['reserve']['reserve_diff'] = data['reserve'].apply(lambda row: row['reserve_diff'].days, axis=1)

Фильтруем резервы, оформленные на той же неделе, что и посещение

In [12]:
# data['reserve'] = data['reserve'][data['reserve']['reserve_diff'] > data['reserve']['visit_datetime'].dt.dayofweek]

In [13]:
print('reserve', data['reserve'].shape)

data['reserve'].head()

reserve (120561, 6)


Unnamed: 0,air_store_id,reserve_datetime,reserve_visitors,visit_datetime,visit_date,reserve_diff
0,air_877f79706adbfb06,2016-01-01 16:00:00,1,2016-01-01 19:00:00,2016-01-01,0
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,3,2016-01-01 19:00:00,2016-01-01,0
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,6,2016-01-01 19:00:00,2016-01-01,0
3,air_877f79706adbfb06,2016-01-01 16:00:00,2,2016-01-01 20:00:00,2016-01-01,0
4,air_db80363d35f10926,2016-01-01 01:00:00,5,2016-01-01 20:00:00,2016-01-01,0


Объединяем записи в `reserve`

In [14]:
reserve = data['reserve'][['air_store_id', 'visit_date']]
for op_type in ['sum', 'mean', 'median', 'min', 'max', 'count']:
    tmp = getattr(
        data['reserve']
            .groupby(['air_store_id', 'visit_date'], as_index=False)
            [['reserve_diff', 'reserve_visitors']],
        op_type,
    )().rename(columns={
        'reserve_diff':     'reserve_diff_'     + op_type,
        'reserve_visitors': 'reserve_visitors_' + op_type,
    })
    
    reserve = pd.merge(reserve, tmp, how='left', on=['air_store_id', 'visit_date'])
    
data['reserve'] = reserve

In [15]:
print('reserve', data['reserve'].shape)

data['reserve'].head()

reserve (120561, 14)


Unnamed: 0,air_store_id,visit_date,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count
0,air_877f79706adbfb06,2016-01-01,0,3,0.0,1.5,0.0,1.5,0,1,0,2,2,2
1,air_db4b38ebe7a7ceff,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,3,0,6,2,2
2,air_db4b38ebe7a7ceff,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,3,0,6,2,2
3,air_877f79706adbfb06,2016-01-01,0,3,0.0,1.5,0.0,1.5,0,1,0,2,2,2
4,air_db80363d35f10926,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,4,0,5,2,2


Добавляем `reserve` в `train` / `test` 

In [16]:
for ds in ['train', 'test']:
    data[ds] = pd.merge(data[ds], data['reserve'], how='left', on=['air_store_id', 'visit_date'])

In [17]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (324849, 24)
test  (32959, 24)


Unnamed: 0,air_store_id,visit_date,visitors,id,holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_int,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,1,0,20160113,,,,,,,,,,,,
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,1,0,20160114,,,,,,,,,,,,
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,1,0,20160115,,,,,,,,,,,,
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,0,1,20160116,,,,,,,,,,,,
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,1,0,20160118,,,,,,,,,,,,


### Формируем `stores`

In [18]:
stores_id = data['test']['air_store_id'].unique()
data['stores'] = pd.concat(
    [pd.DataFrame({
        'air_store_id':     stores_id,
        'air_store_id_int': lbl.fit_transform(stores_id),
        'visit_dow':        [i] * len(stores_id)
    }) for i in range(7)],
    axis=0,
    ignore_index=True,
).reset_index(drop=True)

In [19]:
for op_type in ['sum', 'mean', 'median', 'min', 'max', 'count']:
    tmp = getattr(
        data['train']
            .groupby(['air_store_id', 'visit_dow'], as_index=False)
            [['visitors']],
        op_type,
    )().rename(columns={
        'visitors': 'visitors_' + op_type,
    })
    
    data['stores'] = pd.merge(data['stores'], tmp, how='left', on=['air_store_id', 'visit_dow'])

In [20]:
data['stores'] = pd.merge(data['stores'], data['air_stores'], how='left', on=['air_store_id'])

In [21]:
data['stores']['air_genre_name'] = lbl.fit_transform(data['stores']['air_genre_name'])
data['stores']['air_area_name']  = lbl.fit_transform(data['stores']['air_area_name'])

Добавляем признаки на основе `latitude` и `longitude`

In [22]:
data['stores']['latitude_max']  = data['stores']['latitude'].max()  - data['stores']['latitude']
data['stores']['longitude_max'] = data['stores']['longitude'].max() - data['stores']['longitude']

data['stores']['latitude_longitude'] = data['stores']['latitude'] + data['stores']['longitude']

data['stores']['latitude_longitude_max'] = data['stores']['latitude_longitude'].max() - data['stores']['latitude_longitude']

In [23]:
print('stores', data['stores'].shape)

data['stores'].head()

stores (5747, 17)


Unnamed: 0,air_store_id,air_store_id_int,visit_dow,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count,air_genre_name,air_area_name,latitude,longitude,latitude_max,longitude_max,latitude_longitude,latitude_longitude_max
0,air_00a91d42b08b08d9,0,0,845.0,22.837838,20.0,1.0,47.0,37.0,6,44,35.694003,139.753595,8.326629,4.519803,175.447598,12.846432
1,air_0164b9927d20bcc3,1,0,329.0,9.969697,9.0,2.0,19.0,33.0,6,62,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
2,air_0241aa3964b7f861,2,0,562.0,8.920635,8.0,2.0,23.0,63.0,7,82,35.712607,139.779996,8.308025,4.493403,175.492603,12.801428
3,air_0328696196e46f18,3,0,77.0,6.416667,4.0,2.0,27.0,12.0,4,98,34.701279,135.52809,9.319353,8.745308,170.22937,18.064661
4,air_034a3d5b40d5b1b1,4,0,439.0,11.864865,10.0,1.0,66.0,37.0,2,102,34.692337,135.472229,9.328295,8.801169,170.164566,18.129464


### Объединияем `train` / `test` и `stores`

In [24]:
for ds in ['train', 'test']:
    data[ds] = pd.merge(data[ds], data['stores'], how='left', on=['air_store_id', 'visit_dow'])
    # Возможно, стоит использовать `median` 
    data[ds] = data[ds].fillna(data[ds].mean(numeric_only=True))

In [25]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (324849, 39)
test  (32959, 39)


Unnamed: 0,air_store_id,visit_date,visitors,id,holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_int,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count,air_store_id_int,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count,air_genre_name,air_area_name,latitude,longitude,latitude_max,longitude_max,latitude_longitude,latitude_longitude_max
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,1,0,20160113,277.779009,27.939524,6.562279,4.408958,5.365311,3.85591,1.930025,2.763425,14.224338,7.764623,6.633766,6.633766,599.0,1526.0,23.84375,25.0,7.0,57.0,64.0,4.0,62.0,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,1,0,20160114,277.779009,27.939524,6.562279,4.408958,5.365311,3.85591,1.930025,2.763425,14.224338,7.764623,6.633766,6.633766,599.0,1319.0,20.292308,21.0,2.0,54.0,65.0,4.0,62.0,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,1,0,20160115,277.779009,27.939524,6.562279,4.408958,5.365311,3.85591,1.930025,2.763425,14.224338,7.764623,6.633766,6.633766,599.0,2258.0,34.738462,35.0,4.0,61.0,65.0,4.0,62.0,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,0,1,20160116,277.779009,27.939524,6.562279,4.408958,5.365311,3.85591,1.930025,2.763425,14.224338,7.764623,6.633766,6.633766,599.0,1825.0,27.651515,27.0,6.0,53.0,66.0,4.0,62.0,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,1,0,20160118,277.779009,27.939524,6.562279,4.408958,5.365311,3.85591,1.930025,2.763425,14.224338,7.764623,6.633766,6.633766,599.0,784.0,13.754386,12.0,2.0,34.0,57.0,4.0,62.0,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363


## Обучение

In [26]:
columns = [col for col in data['train'] if col not in ['id', 'air_store_id', 'visit_date','visitors']]

In [27]:
model_gbr = ensemble.GradientBoostingRegressor(
    learning_rate=0.2,
    random_state=3,
    n_estimators=200,
    subsample=0.8,
    max_depth=10,
)

In [29]:
model_gbr.fit(data['train'][columns], np.log1p(data['train']['visitors'].values))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.2, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=3, subsample=0.8, verbose=0,
             warm_start=False)

In [35]:
train_preds_gbr = model_gbr.predict(data['train'][columns])
print('RMSE GBR: ', RMSLE(np.log1p(data['train']['visitors'].values), train_preds_gbr))

RMSE GBR:  0.309632847224


In [38]:
test_preds_gbr = model_gbr.predict(data['test'][columns])
data['test']['visitors'] = np.expm1(test_preds_gbr).clip(min=0.)

In [46]:
test = data['test'][['id', 'visitors']]
test = test.groupby('id', as_index=False).mean()

In [47]:
test.to_csv('result/submission.csv', index=False)