# Restaurant Visitor Forecasting by GooseLearning

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
import catboost

pd.options.display.max_columns = 1000

lbl = preprocessing.LabelEncoder()

In [2]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** .5

In [3]:
data = {
    'air_stores': pd.read_csv('dataset/air_store_info.csv'),
    'hpg_stores': pd.read_csv('dataset/hpg_store_info.csv'),

    'air_reserve': pd.read_csv('dataset/air_reserve.csv'),
    'hpg_reserve': pd.read_csv('dataset/hpg_reserve.csv'),

    'air_hpd': pd.read_csv('dataset/store_id_relation.csv'),
    
    'dates': pd.read_csv('dataset/date_info.csv').rename(columns={
        'calendar_date': 'visit_date',
        'holiday_flg'  : 'visit_holiday',
    })[['visit_date', 'visit_holiday']],

    'train': pd.read_csv('dataset/air_visit_data.csv'),
    'test':  pd.read_csv('dataset/sample_submission.csv'),
}

## Обработка данных

### Обрабатываем `dates`

In [4]:
holydays = np.array([1, 2, 1, 1, 1, 3, 4])

data['dates']['visit_date']     = pd.to_datetime(data['dates']['visit_date'])
data['dates']['visit_day']      = data['dates']['visit_date'].dt.day
data['dates']['visit_month']    = data['dates']['visit_date'].dt.month
data['dates']['visit_year']     = data['dates']['visit_date'].dt.year
data['dates']['visit_dow']      = data['dates']['visit_date'].dt.dayofweek
data['dates']['visit_work']     = data['dates']['visit_dow'] < 5
data['dates']['visit_weekend']  = data['dates']['visit_work'] == False
data['dates']['visit_date_int'] = data['dates']['visit_date'].dt.strftime('%Y%m%d').astype(int)
data['dates']['visit_holiday']  = data['dates'].apply(lambda row: holydays[row['visit_dow']] if row['visit_holiday'] else 0, axis=1)
data['dates']['visit_date']     = data['dates']['visit_date'].dt.strftime('%Y-%m-%d')

In [5]:
print('dates', data['dates'].shape)

data['dates'].head()

dates (517, 9)


Unnamed: 0,visit_date,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int
0,2016-01-01,1,1,1,2016,4,True,False,20160101
1,2016-01-02,3,2,1,2016,5,False,True,20160102
2,2016-01-03,4,3,1,2016,6,False,True,20160103
3,2016-01-04,0,4,1,2016,0,True,False,20160104
4,2016-01-05,0,5,1,2016,1,True,False,20160105


### Обрабатываем `train` и `test`

Приводим к единому формату

In [6]:
data['train']['id'] = data['train']['air_store_id'] + '_' + data['train']['visit_date']

data['test']['air_store_id'] = [id[:20] for id in data['test']['id']]
data['test']['visit_date']   = [id[21:] for id in data['test']['id']]

Дополнительные колонки даты

In [7]:
data['train'] = pd.merge(data['train'], data['dates'], how='left', on=['visit_date'])
data['test']  = pd.merge(data['test'],  data['dates'], how='left', on=['visit_date'])

In [8]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (252108, 12)
test  (32019, 12)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118


### Формируем `reserve`

 Расширяем `air_reserve` данными из `hpg_reserve`

In [9]:
hpg_reserve_air = pd.merge(data['hpg_reserve'], data['air_hpd'], how='inner', on=['hpg_store_id'])
data['reserve'] = pd.concat([data['air_reserve'], hpg_reserve_air.drop('hpg_store_id', axis=1)])

In [10]:
print('air_reserve', data['air_reserve'].shape)
print('reserve    ', data['reserve'].shape)

air_reserve (92378, 4)
reserve     (120561, 4)


Дополнительные колонки даты в `reserve`

In [11]:
data['reserve']['visit_datetime']   = pd.to_datetime(data['reserve']['visit_datetime'])
data['reserve']['reserve_datetime'] = pd.to_datetime(data['reserve']['reserve_datetime'])

data['reserve']['visit_date'] = data['reserve']['visit_datetime'].dt.strftime('%Y-%m-%d')
    
data['reserve']['reserve_diff'] = data['reserve']['visit_datetime'] - data['reserve']['reserve_datetime']
data['reserve']['reserve_diff'] = data['reserve'].apply(lambda row: row['reserve_diff'].days, axis=1)

Фильтруем резервы, оформленные на той же неделе, что и посещение

In [12]:
# data['reserve'] = data['reserve'][data['reserve']['reserve_diff'] > data['reserve']['visit_datetime'].dt.dayofweek]

In [13]:
print('reserve', data['reserve'].shape)

data['reserve'].head()

reserve (120561, 6)


Unnamed: 0,air_store_id,reserve_datetime,reserve_visitors,visit_datetime,visit_date,reserve_diff
0,air_877f79706adbfb06,2016-01-01 16:00:00,1,2016-01-01 19:00:00,2016-01-01,0
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,3,2016-01-01 19:00:00,2016-01-01,0
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,6,2016-01-01 19:00:00,2016-01-01,0
3,air_877f79706adbfb06,2016-01-01 16:00:00,2,2016-01-01 20:00:00,2016-01-01,0
4,air_db80363d35f10926,2016-01-01 01:00:00,5,2016-01-01 20:00:00,2016-01-01,0


Объединяем записи в `reserve`

In [14]:
reserve = data['reserve'][['air_store_id', 'visit_date']]
for op_type in ['sum', 'mean', 'median', 'min', 'max', 'count']:
    tmp = getattr(
        data['reserve']
            .groupby(['air_store_id', 'visit_date'], as_index=False)
            [['reserve_diff', 'reserve_visitors']],
        op_type,
    )().rename(columns={
        'reserve_diff':     'reserve_diff_'     + op_type,
        'reserve_visitors': 'reserve_visitors_' + op_type,
    })
    
    reserve = pd.merge(reserve, tmp, how='left', on=['air_store_id', 'visit_date'])
    
data['reserve'] = reserve

In [15]:
print('reserve', data['reserve'].shape)

data['reserve'].head()

reserve (120561, 14)


Unnamed: 0,air_store_id,visit_date,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count
0,air_877f79706adbfb06,2016-01-01,0,3,0.0,1.5,0.0,1.5,0,1,0,2,2,2
1,air_db4b38ebe7a7ceff,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,3,0,6,2,2
2,air_db4b38ebe7a7ceff,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,3,0,6,2,2
3,air_877f79706adbfb06,2016-01-01,0,3,0.0,1.5,0.0,1.5,0,1,0,2,2,2
4,air_db80363d35f10926,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,4,0,5,2,2


Добавляем `reserve` в `train` / `test` 

In [16]:
for ds in ['train', 'test']:
    data[ds] = pd.merge(data[ds], data['reserve'], how='left', on=['air_store_id', 'visit_date'])

In [17]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (324849, 24)
test  (32959, 24)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113,,,,,,,,,,,,
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114,,,,,,,,,,,,
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115,,,,,,,,,,,,
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116,,,,,,,,,,,,
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118,,,,,,,,,,,,


### Формируем `stores`

In [18]:
stores_id = data['air_stores']['air_store_id'].unique()
data['stores'] = pd.concat(
    [pd.DataFrame({
        'air_store_id':     stores_id,
        'visit_dow':        [i] * len(stores_id)
    }) for i in range(7)],
    axis=0,
    ignore_index=True,
).reset_index(drop=True)

In [19]:
for op_type in ['sum', 'mean', 'median', 'min', 'max', 'count']:
    tmp = getattr(
        data['train']
            .groupby(['air_store_id', 'visit_dow'], as_index=False)
            [['visitors']],
        op_type,
    )().rename(columns={
        'visitors': 'visitors_' + op_type,
    })
    
    data['stores'] = pd.merge(data['stores'], tmp, how='left', on=['air_store_id', 'visit_dow'])

In [20]:
data['stores'] = pd.merge(data['stores'], data['air_stores'], how='left', on=['air_store_id'])

In [21]:
data['stores']['air_genre_name'] = data['stores']['air_genre_name']
data['stores']['air_area_name']  = data['stores']['air_area_name']

Добавляем признаки на основе `latitude` и `longitude`

In [22]:
data['stores']['latitude_max']  = data['stores']['latitude'].max()  - data['stores']['latitude']
data['stores']['longitude_max'] = data['stores']['longitude'].max() - data['stores']['longitude']

data['stores']['latitude_longitude'] = data['stores']['latitude'] + data['stores']['longitude']

data['stores']['latitude_longitude_max'] = data['stores']['latitude_longitude'].max() - data['stores']['latitude_longitude']

In [23]:
print('stores', data['stores'].shape)

data['stores'].head()

stores (5803, 16)


Unnamed: 0,air_store_id,visit_dow,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count,air_genre_name,air_area_name,latitude,longitude,latitude_max,longitude_max,latitude_longitude,latitude_longitude_max
0,air_0f0cdeee6c9bf3d7,0,1165.0,21.181818,18.0,2.0,39.0,55.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
1,air_7cc17a324ae5c7dc,0,397.0,12.030303,9.0,1.0,34.0,33.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
2,air_fee8dcf4d619598e,0,830.0,20.243902,18.0,7.0,39.0,41.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
3,air_a17f0778617c76e2,0,158.0,26.333333,27.5,19.0,31.0,6.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
4,air_83db5aff8f50478e,0,248.0,7.085714,7.0,1.0,13.0,35.0,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363


### Объединияем `train` / `test` и `stores`

In [24]:
for ds in ['train', 'test']:
    data[ds] = pd.merge(data[ds], data['stores'], how='left', on=['air_store_id', 'visit_dow'])
    data[ds] = data[ds].fillna(data[ds].median(numeric_only=True))

In [25]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (324849, 38)
test  (32959, 38)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count,air_genre_name,air_area_name,latitude,longitude,latitude_max,longitude_max,latitude_longitude,latitude_longitude_max
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113,16.0,17.0,3.75,3.333333,2.0,3.0,0.0,2.0,9.0,5.0,4.0,4.0,1526.0,23.84375,25.0,7.0,57.0,64.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114,16.0,17.0,3.75,3.333333,2.0,3.0,0.0,2.0,9.0,5.0,4.0,4.0,1319.0,20.292308,21.0,2.0,54.0,65.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115,16.0,17.0,3.75,3.333333,2.0,3.0,0.0,2.0,9.0,5.0,4.0,4.0,2258.0,34.738462,35.0,4.0,61.0,65.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116,16.0,17.0,3.75,3.333333,2.0,3.0,0.0,2.0,9.0,5.0,4.0,4.0,1825.0,27.651515,27.0,6.0,53.0,66.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118,16.0,17.0,3.75,3.333333,2.0,3.0,0.0,2.0,9.0,5.0,4.0,4.0,784.0,13.754386,12.0,2.0,34.0,57.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363


## Обучение

In [26]:
columns = pd.Series([col for col in data['train'] if col not in ['id', 'visit_date', 'visitors']])
cat_features = columns[columns.isin([
    'air_store_id',
    'visit_holiday',
    'visit_day',
    'visit_month',
    'visit_year',
    'visit_dow',
    'visit_work',
    'visit_weekend',
    'air_genre_name',
    'air_area_name',
])].index

In [27]:
train_pool = catboost.Pool(
    data['train'][columns],
    np.log1p(data['train']['visitors'].values),
    cat_features=cat_features.tolist(),
)

test_pool = catboost.Pool(
    data['test'][columns],
    cat_features=cat_features.tolist(),
)

In [28]:
model = catboost.CatBoostRegressor(
     iterations=1000,
)

model.fit(train_pool)

0:	learn: 2.9627216	total: 391ms	remaining: 6m 30s
1:	learn: 2.8765892	total: 951ms	remaining: 7m 54s
2:	learn: 2.7931129	total: 1.47s	remaining: 8m 7s
3:	learn: 2.7122143	total: 1.88s	remaining: 7m 48s
4:	learn: 2.6338317	total: 2.22s	remaining: 7m 22s
5:	learn: 2.5578778	total: 2.54s	remaining: 7m
6:	learn: 2.4842611	total: 2.9s	remaining: 6m 51s
7:	learn: 2.4129551	total: 3.35s	remaining: 6m 55s
8:	learn: 2.3438697	total: 3.71s	remaining: 6m 48s
9:	learn: 2.2769290	total: 4.15s	remaining: 6m 51s
10:	learn: 2.2120941	total: 4.53s	remaining: 6m 46s
11:	learn: 2.1492878	total: 5s	remaining: 6m 51s
12:	learn: 2.0884704	total: 5.4s	remaining: 6m 49s
13:	learn: 2.0295665	total: 5.8s	remaining: 6m 48s
14:	learn: 1.9725294	total: 6.29s	remaining: 6m 53s
15:	learn: 1.9173132	total: 6.64s	remaining: 6m 48s
16:	learn: 1.8638397	total: 7.18s	remaining: 6m 55s
17:	learn: 1.8120833	total: 7.53s	remaining: 6m 50s
18:	learn: 1.7619736	total: 7.83s	remaining: 6m 44s
19:	learn: 1.7134701	total: 8.1s	

158:	learn: 0.4763324	total: 47s	remaining: 4m 8s
159:	learn: 0.4762205	total: 47.4s	remaining: 4m 8s
160:	learn: 0.4761190	total: 47.7s	remaining: 4m 8s
161:	learn: 0.4759872	total: 48.1s	remaining: 4m 8s
162:	learn: 0.4758904	total: 48.4s	remaining: 4m 8s
163:	learn: 0.4757895	total: 48.7s	remaining: 4m 8s
164:	learn: 0.4756695	total: 49s	remaining: 4m 7s
165:	learn: 0.4753324	total: 49.3s	remaining: 4m 7s
166:	learn: 0.4752352	total: 49.6s	remaining: 4m 7s
167:	learn: 0.4751288	total: 50s	remaining: 4m 7s
168:	learn: 0.4750258	total: 50.4s	remaining: 4m 7s
169:	learn: 0.4747279	total: 50.9s	remaining: 4m 8s
170:	learn: 0.4746563	total: 51.3s	remaining: 4m 8s
171:	learn: 0.4745674	total: 51.7s	remaining: 4m 8s
172:	learn: 0.4744580	total: 52s	remaining: 4m 8s
173:	learn: 0.4744030	total: 52.4s	remaining: 4m 8s
174:	learn: 0.4743201	total: 52.8s	remaining: 4m 8s
175:	learn: 0.4741515	total: 53.1s	remaining: 4m 8s
176:	learn: 0.4740617	total: 53.5s	remaining: 4m 8s
177:	learn: 0.473996

313:	learn: 0.4620751	total: 1m 42s	remaining: 3m 43s
314:	learn: 0.4620459	total: 1m 42s	remaining: 3m 43s
315:	learn: 0.4620237	total: 1m 42s	remaining: 3m 42s
316:	learn: 0.4619905	total: 1m 43s	remaining: 3m 42s
317:	learn: 0.4619515	total: 1m 43s	remaining: 3m 41s
318:	learn: 0.4618907	total: 1m 43s	remaining: 3m 41s
319:	learn: 0.4618335	total: 1m 44s	remaining: 3m 41s
320:	learn: 0.4617768	total: 1m 44s	remaining: 3m 40s
321:	learn: 0.4616823	total: 1m 44s	remaining: 3m 40s
322:	learn: 0.4616055	total: 1m 44s	remaining: 3m 39s
323:	learn: 0.4615407	total: 1m 45s	remaining: 3m 39s
324:	learn: 0.4615148	total: 1m 45s	remaining: 3m 39s
325:	learn: 0.4614726	total: 1m 45s	remaining: 3m 38s
326:	learn: 0.4614301	total: 1m 46s	remaining: 3m 38s
327:	learn: 0.4614040	total: 1m 46s	remaining: 3m 37s
328:	learn: 0.4613817	total: 1m 46s	remaining: 3m 37s
329:	learn: 0.4613536	total: 1m 46s	remaining: 3m 36s
330:	learn: 0.4612986	total: 1m 47s	remaining: 3m 36s
331:	learn: 0.4612575	total:

467:	learn: 0.4564333	total: 2m 24s	remaining: 2m 44s
468:	learn: 0.4563960	total: 2m 25s	remaining: 2m 44s
469:	learn: 0.4563840	total: 2m 25s	remaining: 2m 43s
470:	learn: 0.4563742	total: 2m 25s	remaining: 2m 43s
471:	learn: 0.4563624	total: 2m 25s	remaining: 2m 43s
472:	learn: 0.4563414	total: 2m 26s	remaining: 2m 42s
473:	learn: 0.4563121	total: 2m 26s	remaining: 2m 42s
474:	learn: 0.4562902	total: 2m 26s	remaining: 2m 42s
475:	learn: 0.4562719	total: 2m 27s	remaining: 2m 41s
476:	learn: 0.4562480	total: 2m 27s	remaining: 2m 41s
477:	learn: 0.4561892	total: 2m 27s	remaining: 2m 41s
478:	learn: 0.4561813	total: 2m 27s	remaining: 2m 40s
479:	learn: 0.4561515	total: 2m 28s	remaining: 2m 40s
480:	learn: 0.4561300	total: 2m 28s	remaining: 2m 40s
481:	learn: 0.4561209	total: 2m 28s	remaining: 2m 39s
482:	learn: 0.4560827	total: 2m 28s	remaining: 2m 39s
483:	learn: 0.4560552	total: 2m 29s	remaining: 2m 39s
484:	learn: 0.4560396	total: 2m 29s	remaining: 2m 38s
485:	learn: 0.4560258	total:

621:	learn: 0.4530370	total: 3m 7s	remaining: 1m 53s
622:	learn: 0.4530239	total: 3m 7s	remaining: 1m 53s
623:	learn: 0.4530023	total: 3m 7s	remaining: 1m 53s
624:	learn: 0.4529712	total: 3m 8s	remaining: 1m 52s
625:	learn: 0.4529668	total: 3m 8s	remaining: 1m 52s
626:	learn: 0.4529567	total: 3m 9s	remaining: 1m 52s
627:	learn: 0.4529432	total: 3m 9s	remaining: 1m 52s
628:	learn: 0.4529257	total: 3m 9s	remaining: 1m 51s
629:	learn: 0.4529040	total: 3m 9s	remaining: 1m 51s
630:	learn: 0.4528845	total: 3m 10s	remaining: 1m 51s
631:	learn: 0.4528666	total: 3m 10s	remaining: 1m 50s
632:	learn: 0.4528498	total: 3m 10s	remaining: 1m 50s
633:	learn: 0.4528114	total: 3m 11s	remaining: 1m 50s
634:	learn: 0.4528044	total: 3m 11s	remaining: 1m 50s
635:	learn: 0.4527950	total: 3m 11s	remaining: 1m 49s
636:	learn: 0.4527792	total: 3m 11s	remaining: 1m 49s
637:	learn: 0.4527516	total: 3m 12s	remaining: 1m 49s
638:	learn: 0.4527289	total: 3m 12s	remaining: 1m 48s
639:	learn: 0.4527163	total: 3m 12s	r

774:	learn: 0.4506449	total: 3m 52s	remaining: 1m 7s
775:	learn: 0.4506381	total: 3m 52s	remaining: 1m 7s
776:	learn: 0.4506284	total: 3m 52s	remaining: 1m 6s
777:	learn: 0.4506238	total: 3m 52s	remaining: 1m 6s
778:	learn: 0.4506091	total: 3m 53s	remaining: 1m 6s
779:	learn: 0.4506018	total: 3m 53s	remaining: 1m 5s
780:	learn: 0.4505809	total: 3m 53s	remaining: 1m 5s
781:	learn: 0.4505591	total: 3m 54s	remaining: 1m 5s
782:	learn: 0.4505475	total: 3m 54s	remaining: 1m 4s
783:	learn: 0.4505170	total: 3m 54s	remaining: 1m 4s
784:	learn: 0.4504967	total: 3m 54s	remaining: 1m 4s
785:	learn: 0.4504834	total: 3m 55s	remaining: 1m 3s
786:	learn: 0.4504597	total: 3m 55s	remaining: 1m 3s
787:	learn: 0.4504519	total: 3m 55s	remaining: 1m 3s
788:	learn: 0.4504362	total: 3m 55s	remaining: 1m 3s
789:	learn: 0.4504259	total: 3m 56s	remaining: 1m 2s
790:	learn: 0.4504010	total: 3m 56s	remaining: 1m 2s
791:	learn: 0.4503826	total: 3m 56s	remaining: 1m 2s
792:	learn: 0.4503631	total: 3m 56s	remaining:

931:	learn: 0.4486831	total: 4m 34s	remaining: 20s
932:	learn: 0.4486664	total: 4m 34s	remaining: 19.7s
933:	learn: 0.4486513	total: 4m 35s	remaining: 19.4s
934:	learn: 0.4486365	total: 4m 35s	remaining: 19.1s
935:	learn: 0.4486344	total: 4m 35s	remaining: 18.9s
936:	learn: 0.4486278	total: 4m 36s	remaining: 18.6s
937:	learn: 0.4486186	total: 4m 36s	remaining: 18.3s
938:	learn: 0.4486092	total: 4m 36s	remaining: 18s
939:	learn: 0.4486015	total: 4m 36s	remaining: 17.7s
940:	learn: 0.4485806	total: 4m 37s	remaining: 17.4s
941:	learn: 0.4485726	total: 4m 37s	remaining: 17.1s
942:	learn: 0.4485660	total: 4m 37s	remaining: 16.8s
943:	learn: 0.4485517	total: 4m 37s	remaining: 16.5s
944:	learn: 0.4485450	total: 4m 38s	remaining: 16.2s
945:	learn: 0.4485384	total: 4m 38s	remaining: 15.9s
946:	learn: 0.4485308	total: 4m 38s	remaining: 15.6s
947:	learn: 0.4485181	total: 4m 39s	remaining: 15.3s
948:	learn: 0.4485068	total: 4m 39s	remaining: 15s
949:	learn: 0.4484972	total: 4m 39s	remaining: 14.7s

<catboost.core.CatBoostRegressor at 0x7fe822ea7358>

In [29]:
train_preds = model.predict(train_pool)
print('RMSLE:', RMSLE(np.log1p(data['train']['visitors'].values), train_preds))

RMSLE: 0.387908972211


In [30]:
test_preds = model.predict(test_pool)
data['test']['visitors'] = np.expm1(test_preds).clip(min=0.)

In [31]:
test = data['test'][['id', 'visitors']]
test = test.groupby('id', as_index=False).mean()

In [32]:
test.to_csv('result/submission.csv', index=False)