# Restaurant Visitor Forecasting by GooseLearning

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn import metrics
import catboost

# Let wide feature tables be displayed without pandas eliding columns.
pd.set_option('display.max_columns', 1000)

%matplotlib inline

In [2]:
def RMSLE(y, pred):
    """Return the square root of the mean squared error between `y` and `pred`.

    No logarithm is applied inside this function. It only yields a root mean
    squared *logarithmic* error when both arguments are already in log space,
    which is how this notebook calls it (log1p-transformed targets versus
    predictions of a model fitted on log1p targets).
    """
    mse = metrics.mean_squared_error(y, pred)
    return mse ** .5

In [3]:
# Read every input table once and keep them in a single dict keyed by a short
# name. The first three files live under result/ (presumably produced by an
# earlier preprocessing step -- confirm); the last two come from dataset/.
data = {
    name: pd.read_csv(path)
    for name, path in [
        ('stores',  'result/stores.csv'),
        ('reserve', 'result/reserve.csv'),
        ('dates',   'result/dates.csv'),

        ('train', 'dataset/air_visit_data.csv'),
        ('test',  'dataset/sample_submission.csv'),
    ]
}

## Обработка данных

### Обрабатываем `train` и `test`

Приводим к единому формату

In [4]:
# Bring train and test to the same layout: both end up with `id`,
# `air_store_id` and `visit_date` columns.

# Train has store and date as separate columns; build the combined id.
data['train']['id'] = data['train']['air_store_id'] + '_' + data['train']['visit_date']

# Test only has the combined id ("<air_store_id>_<visit_date>"). Split on the
# LAST underscore instead of slicing at fixed offsets, so the result does not
# depend on the store id having a particular length (the store id itself
# contains an underscore, the date does not).
id_parts = data['test']['id'].str.rsplit('_', n=1, expand=True)
data['test']['air_store_id'] = id_parts[0]
data['test']['visit_date']   = id_parts[1]

Добавляем дополнительные колонки даты

In [5]:
# Attach the calendar columns from `dates` to both datasets, matching on the
# visit date. A left join keeps every train/test row.
for ds in ['train', 'test']:
    data[ds] = data[ds].merge(data['dates'], on=['visit_date'], how='left')

In [6]:
# Sanity check: report the shape of both frames after the join
# (labels are padded to equal width), then preview train.
for ds in ['train', 'test']:
    print(ds.ljust(5), data[ds].shape)

data['train'].head()

train (252108, 12)
test  (32019, 12)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118


### Обрабатываем `reserve`

Добавляем `reserve` в `train` / `test` 

In [7]:
# Join the reservation table onto both datasets per (store, date).
# A left join keeps every train/test row.
reserve_keys = ['air_store_id', 'visit_date']
for ds in ('train', 'test'):
    data[ds] = data[ds].merge(data['reserve'], how='left', on=reserve_keys)

In [8]:
# Sanity check: report the shape of both frames after the reserve join
# (labels are padded to equal width), then preview train.
for ds in ['train', 'test']:
    print(ds.ljust(5), data[ds].shape)

data['train'].head()

train (252108, 14)
test  (32019, 14)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int,reserve_visitors,reserve_visitors_competitor
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113,0,7438
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114,0,8052
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115,0,21468
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116,0,19082
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118,0,6035


### Обрабатываем `stores`

Добавляем информацию о посещениях

In [9]:
# Per (store, day-of-week) summary statistics of the historical visitor counts,
# computed in a single groupby pass and attached to `stores` with one merge.
# Resulting columns: visitors_sum, visitors_mean, visitors_median,
# visitors_min, visitors_max, visitors_count.
#
# NOTE(review): these statistics are computed from the full training target and
# `stores` is later joined back onto the same training rows, so each training
# row's features include its own `visitors` value -- looks like target leakage;
# confirm whether out-of-fold statistics are wanted.
visitor_stats = (
    data['train']
        .groupby(['air_store_id', 'visit_dow'])['visitors']
        .agg(['sum', 'mean', 'median', 'min', 'max', 'count'])
        .add_prefix('visitors_')
        .reset_index()
)

data['stores'] = pd.merge(data['stores'], visitor_stats, how='left', on=['air_store_id', 'visit_dow'])

In [10]:
# Sanity check: shape and first rows of the enriched store table.
print('stores {}'.format(data['stores'].shape))

data['stores'].head()

stores (5803, 13)


Unnamed: 0,air_store_id,visit_dow,air_genre_name,air_area_name,latitude,longitude,city,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count
0,air_0f0cdeee6c9bf3d7,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Tokyo,609.0,16.916667,15.0,2.0,39.0,36.0
1,air_7cc17a324ae5c7dc,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Tokyo,219.0,9.125,7.0,1.0,34.0,24.0
2,air_fee8dcf4d619598e,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Tokyo,813.0,20.325,18.5,7.0,39.0,40.0
3,air_a17f0778617c76e2,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Tokyo,158.0,26.333333,27.5,19.0,31.0,6.0
4,air_83db5aff8f50478e,0,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tokyo,202.0,6.516129,6.0,1.0,13.0,31.0


Объединяем `train` / `test` и `stores`

In [11]:
# Join the store table onto both datasets per (store, day-of-week).
# A left join keeps every train/test row.
store_keys = ['air_store_id', 'visit_dow']
for ds in ('train', 'test'):
    data[ds] = data[ds].merge(data['stores'], how='left', on=store_keys)

In [12]:
# Sanity check: report the shape of both frames after the stores join
# (labels are padded to equal width), then preview train.
for ds in ['train', 'test']:
    print(ds.ljust(5), data[ds].shape)

data['train'].head()

train (252108, 25)
test  (32019, 25)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int,reserve_visitors,reserve_visitors_competitor,air_genre_name,air_area_name,latitude,longitude,city,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113,0,7438,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tokyo,1526.0,23.84375,25.0,7.0,57.0,64.0
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114,0,8052,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tokyo,1319.0,20.292308,21.0,2.0,54.0,65.0
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115,0,21468,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tokyo,2258.0,34.738462,35.0,4.0,61.0,65.0
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116,0,19082,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tokyo,1825.0,27.651515,27.0,6.0,53.0,66.0
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118,0,6035,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tokyo,784.0,13.754386,12.0,2.0,34.0,57.0


## Обучение

In [13]:
# Columns that must not be fed to the model: the row identifier, the raw date
# string, and the prediction target.
non_feature_columns = ['id', 'visit_date', 'visitors']

# Feature columns to be treated as categorical.
categorical_columns = [
    'air_store_id',
    'visit_holiday',
    'visit_day',
    'visit_month',
    'visit_year',
    'visit_dow',
    'visit_work',
    'visit_weekend',
    'air_genre_name',
    'air_area_name',
    'city',
]

# Ordered feature names, taken from the train frame's column order.
columns = pd.Series([
    name for name in data['train'].columns
    if name not in non_feature_columns
])

# Positions (within `columns`) of the categorical features.
is_categorical = columns.isin(categorical_columns)
cat_features = columns.loc[is_categorical].index

In [14]:
# Training pool: the feature matrix plus the log1p-transformed visitor counts
# as the target, so the model is fitted in log space.
train_pool = catboost.Pool(
    data['train'][columns],
    np.log1p(data['train']['visitors'].values),
    cat_features=cat_features.tolist(),
)

# Inference pool: built from the TEST rows (same feature columns, same order,
# hence the same categorical positions). It was previously built from the train
# frame, which made the "test" predictions training-set predictions whose
# length does not match the test frame they are later assigned to.
test_pool = catboost.Pool(
    data['test'][columns],
    cat_features=cat_features.tolist(),
)

In [15]:
# Gradient-boosted regressor with 1000 boosting iterations and otherwise
# default settings, fitted on the whole training pool (no validation set is
# passed here).
model = catboost.CatBoostRegressor(iterations=1000)

model.fit(train_pool)

0:	learn: 2.8323858	total: 273ms	remaining: 4m 32s
1:	learn: 2.7505254	total: 464ms	remaining: 3m 51s
2:	learn: 2.6712147	total: 658ms	remaining: 3m 38s
3:	learn: 2.5943525	total: 822ms	remaining: 3m 24s
4:	learn: 2.5198742	total: 1.02s	remaining: 3m 22s
5:	learn: 2.4477305	total: 1.18s	remaining: 3m 14s
6:	learn: 2.3778385	total: 1.35s	remaining: 3m 12s
7:	learn: 2.3101267	total: 1.54s	remaining: 3m 11s
8:	learn: 2.2445499	total: 1.74s	remaining: 3m 11s
9:	learn: 2.1810304	total: 1.97s	remaining: 3m 14s
10:	learn: 2.1195184	total: 2.2s	remaining: 3m 17s
11:	learn: 2.0599640	total: 2.37s	remaining: 3m 15s
12:	learn: 2.0022913	total: 2.56s	remaining: 3m 14s
13:	learn: 1.9464584	total: 2.73s	remaining: 3m 12s
14:	learn: 1.8924099	total: 2.93s	remaining: 3m 12s
15:	learn: 1.8401023	total: 3.1s	remaining: 3m 10s
16:	learn: 1.7894861	total: 3.28s	remaining: 3m 9s
17:	learn: 1.7404998	total: 3.47s	remaining: 3m 9s
18:	learn: 1.6931094	total: 3.64s	remaining: 3m 8s
19:	learn: 1.6472601	total:

158:	learn: 0.5061648	total: 38s	remaining: 3m 21s
159:	learn: 0.5060914	total: 38.2s	remaining: 3m 20s
160:	learn: 0.5059574	total: 38.6s	remaining: 3m 21s
161:	learn: 0.5057874	total: 38.9s	remaining: 3m 21s
162:	learn: 0.5056207	total: 39.2s	remaining: 3m 21s
163:	learn: 0.5054836	total: 39.4s	remaining: 3m 20s
164:	learn: 0.5053417	total: 39.9s	remaining: 3m 21s
165:	learn: 0.5052135	total: 40.3s	remaining: 3m 22s
166:	learn: 0.5051286	total: 40.6s	remaining: 3m 22s
167:	learn: 0.5050623	total: 40.9s	remaining: 3m 22s
168:	learn: 0.5049224	total: 41.1s	remaining: 3m 22s
169:	learn: 0.5048479	total: 41.4s	remaining: 3m 22s
170:	learn: 0.5047151	total: 42s	remaining: 3m 23s
171:	learn: 0.5046082	total: 42.4s	remaining: 3m 23s
172:	learn: 0.5045248	total: 42.7s	remaining: 3m 24s
173:	learn: 0.5044495	total: 43.1s	remaining: 3m 24s
174:	learn: 0.5043845	total: 43.5s	remaining: 3m 25s
175:	learn: 0.5042691	total: 43.8s	remaining: 3m 25s
176:	learn: 0.5041895	total: 44.1s	remaining: 3m 2

314:	learn: 0.4971056	total: 1m 21s	remaining: 2m 56s
315:	learn: 0.4970932	total: 1m 21s	remaining: 2m 56s
316:	learn: 0.4970471	total: 1m 21s	remaining: 2m 56s
317:	learn: 0.4970055	total: 1m 21s	remaining: 2m 55s
318:	learn: 0.4969558	total: 1m 22s	remaining: 2m 55s
319:	learn: 0.4969091	total: 1m 22s	remaining: 2m 54s
320:	learn: 0.4968823	total: 1m 22s	remaining: 2m 54s
321:	learn: 0.4968664	total: 1m 22s	remaining: 2m 54s
322:	learn: 0.4968364	total: 1m 23s	remaining: 2m 54s
323:	learn: 0.4967834	total: 1m 23s	remaining: 2m 54s
324:	learn: 0.4967586	total: 1m 24s	remaining: 2m 54s
325:	learn: 0.4967131	total: 1m 24s	remaining: 2m 54s
326:	learn: 0.4966646	total: 1m 24s	remaining: 2m 54s
327:	learn: 0.4966526	total: 1m 25s	remaining: 2m 54s
328:	learn: 0.4966449	total: 1m 25s	remaining: 2m 54s
329:	learn: 0.4966020	total: 1m 25s	remaining: 2m 54s
330:	learn: 0.4965770	total: 1m 26s	remaining: 2m 54s
331:	learn: 0.4965629	total: 1m 26s	remaining: 2m 54s
332:	learn: 0.4965324	total:

466:	learn: 0.4925986	total: 2m	remaining: 2m 17s
467:	learn: 0.4925763	total: 2m	remaining: 2m 17s
468:	learn: 0.4925535	total: 2m	remaining: 2m 16s
469:	learn: 0.4924847	total: 2m 1s	remaining: 2m 16s
470:	learn: 0.4924561	total: 2m 1s	remaining: 2m 16s
471:	learn: 0.4924296	total: 2m 1s	remaining: 2m 16s
472:	learn: 0.4923832	total: 2m 1s	remaining: 2m 15s
473:	learn: 0.4923695	total: 2m 2s	remaining: 2m 15s
474:	learn: 0.4923341	total: 2m 2s	remaining: 2m 15s
475:	learn: 0.4923273	total: 2m 2s	remaining: 2m 15s
476:	learn: 0.4923027	total: 2m 2s	remaining: 2m 14s
477:	learn: 0.4922723	total: 2m 3s	remaining: 2m 14s
478:	learn: 0.4922415	total: 2m 3s	remaining: 2m 14s
479:	learn: 0.4922327	total: 2m 3s	remaining: 2m 14s
480:	learn: 0.4921866	total: 2m 3s	remaining: 2m 13s
481:	learn: 0.4921676	total: 2m 4s	remaining: 2m 13s
482:	learn: 0.4921518	total: 2m 4s	remaining: 2m 13s
483:	learn: 0.4921291	total: 2m 4s	remaining: 2m 13s
484:	learn: 0.4921087	total: 2m 5s	remaining: 2m 12s
48

620:	learn: 0.4894907	total: 2m 37s	remaining: 1m 36s
621:	learn: 0.4894500	total: 2m 38s	remaining: 1m 36s
622:	learn: 0.4894400	total: 2m 38s	remaining: 1m 35s
623:	learn: 0.4893989	total: 2m 38s	remaining: 1m 35s
624:	learn: 0.4893838	total: 2m 39s	remaining: 1m 35s
625:	learn: 0.4893566	total: 2m 39s	remaining: 1m 35s
626:	learn: 0.4893451	total: 2m 39s	remaining: 1m 34s
627:	learn: 0.4893362	total: 2m 39s	remaining: 1m 34s
628:	learn: 0.4893327	total: 2m 40s	remaining: 1m 34s
629:	learn: 0.4893288	total: 2m 40s	remaining: 1m 34s
630:	learn: 0.4893216	total: 2m 40s	remaining: 1m 34s
631:	learn: 0.4893002	total: 2m 41s	remaining: 1m 33s
632:	learn: 0.4892901	total: 2m 41s	remaining: 1m 33s
633:	learn: 0.4892827	total: 2m 41s	remaining: 1m 33s
634:	learn: 0.4892708	total: 2m 42s	remaining: 1m 33s
635:	learn: 0.4892607	total: 2m 42s	remaining: 1m 33s
636:	learn: 0.4892502	total: 2m 43s	remaining: 1m 32s
637:	learn: 0.4892400	total: 2m 43s	remaining: 1m 32s
638:	learn: 0.4892368	total:

774:	learn: 0.4874616	total: 3m 21s	remaining: 58.5s
775:	learn: 0.4874555	total: 3m 21s	remaining: 58.2s
776:	learn: 0.4874533	total: 3m 21s	remaining: 58s
777:	learn: 0.4874517	total: 3m 22s	remaining: 57.7s
778:	learn: 0.4874396	total: 3m 22s	remaining: 57.5s
779:	learn: 0.4874195	total: 3m 22s	remaining: 57.2s
780:	learn: 0.4874099	total: 3m 23s	remaining: 56.9s
781:	learn: 0.4874015	total: 3m 23s	remaining: 56.7s
782:	learn: 0.4873580	total: 3m 23s	remaining: 56.4s
783:	learn: 0.4873289	total: 3m 23s	remaining: 56.1s
784:	learn: 0.4873144	total: 3m 23s	remaining: 55.9s
785:	learn: 0.4873107	total: 3m 24s	remaining: 55.6s
786:	learn: 0.4872996	total: 3m 24s	remaining: 55.3s
787:	learn: 0.4872957	total: 3m 24s	remaining: 55.1s
788:	learn: 0.4872539	total: 3m 24s	remaining: 54.8s
789:	learn: 0.4872522	total: 3m 25s	remaining: 54.6s
790:	learn: 0.4872473	total: 3m 25s	remaining: 54.3s
791:	learn: 0.4872427	total: 3m 25s	remaining: 54s
792:	learn: 0.4872324	total: 3m 26s	remaining: 53.

930:	learn: 0.4857255	total: 3m 59s	remaining: 17.7s
931:	learn: 0.4857235	total: 3m 59s	remaining: 17.5s
932:	learn: 0.4857062	total: 3m 59s	remaining: 17.2s
933:	learn: 0.4856987	total: 3m 59s	remaining: 16.9s
934:	learn: 0.4856871	total: 3m 59s	remaining: 16.7s
935:	learn: 0.4856741	total: 4m	remaining: 16.4s
936:	learn: 0.4856670	total: 4m	remaining: 16.2s
937:	learn: 0.4856503	total: 4m	remaining: 15.9s
938:	learn: 0.4856421	total: 4m	remaining: 15.6s
939:	learn: 0.4856055	total: 4m 1s	remaining: 15.4s
940:	learn: 0.4855900	total: 4m 1s	remaining: 15.1s
941:	learn: 0.4855893	total: 4m 1s	remaining: 14.9s
942:	learn: 0.4855870	total: 4m 1s	remaining: 14.6s
943:	learn: 0.4855768	total: 4m 2s	remaining: 14.4s
944:	learn: 0.4855751	total: 4m 2s	remaining: 14.1s
945:	learn: 0.4855643	total: 4m 2s	remaining: 13.8s
946:	learn: 0.4855310	total: 4m 2s	remaining: 13.6s
947:	learn: 0.4855221	total: 4m 3s	remaining: 13.3s
948:	learn: 0.4855199	total: 4m 3s	remaining: 13.1s
949:	learn: 0.48551

<catboost.core.CatBoostRegressor at 0x7fe4d5356748>

In [16]:
# Score the model on the data it was trained on. Both the target and the
# predictions are in log1p space, so the RMSE computed by RMSLE() is an RMSLE.
train_preds = model.predict(train_pool)
train_target_log = np.log1p(data['train']['visitors'].values)
print('RMSLE:', RMSLE(train_target_log, train_preds))

RMSLE: 0.47887953168


In [17]:
# The model predicts in log1p space, so map predictions back to visitor counts
# with expm1 and clamp anything negative to zero.
test_preds = model.predict(test_pool)
data['test']['visitors'] = np.clip(np.expm1(test_preds), 0., None)

In [18]:
# Keep only the submission columns and collapse to one row per id by averaging
# the predicted visitors over any rows sharing that id.
test = (
    data['test']
        .loc[:, ['id', 'visitors']]
        .groupby('id', as_index=False)
        .mean()
)

In [19]:
# Write the submission file without the DataFrame index column.
submission_path = 'result/submission.csv'
test.to_csv(submission_path, index=False)