# Restaurant Visitor Forecasting by GooseLearning

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
import catboost

pd.options.display.max_columns = 1000

lbl = preprocessing.LabelEncoder()

In [2]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** .5

In [3]:
data = {
    'air_stores': pd.read_csv('dataset/air_store_info.csv'),
    'hpg_stores': pd.read_csv('dataset/hpg_store_info.csv'),

    'air_reserve': pd.read_csv('dataset/air_reserve.csv'),
    'hpg_reserve': pd.read_csv('dataset/hpg_reserve.csv'),

    'air_hpd': pd.read_csv('dataset/store_id_relation.csv'),
    
    'dates': pd.read_csv('dataset/date_info.csv').rename(columns={
        'calendar_date': 'visit_date',
        'holiday_flg'  : 'visit_holiday',
    })[['visit_date', 'visit_holiday']],

    'train': pd.read_csv('dataset/air_visit_data.csv'),
    'test':  pd.read_csv('dataset/sample_submission.csv'),
}

## Обработка данных

### Обрабатываем `dates`

In [4]:
holydays = np.array([1, 2, 1, 1, 1, 3, 4])

data['dates']['visit_date']     = pd.to_datetime(data['dates']['visit_date'])
data['dates']['visit_day']      = data['dates']['visit_date'].dt.day
data['dates']['visit_month']    = data['dates']['visit_date'].dt.month
data['dates']['visit_year']     = data['dates']['visit_date'].dt.year
data['dates']['visit_dow']      = data['dates']['visit_date'].dt.dayofweek
data['dates']['visit_work']     = data['dates']['visit_dow'] < 5
data['dates']['visit_weekend']  = data['dates']['visit_work'] == False
data['dates']['visit_date_int'] = data['dates']['visit_date'].dt.strftime('%Y%m%d').astype(int)
data['dates']['visit_holiday']  = data['dates'].apply(lambda row: holydays[row['visit_dow']] if row['visit_holiday'] else 0, axis=1)
data['dates']['visit_date']     = data['dates']['visit_date'].dt.strftime('%Y-%m-%d')

In [5]:
print('dates', data['dates'].shape)

data['dates'].head()

dates (517, 9)


Unnamed: 0,visit_date,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int
0,2016-01-01,1,1,1,2016,4,True,False,20160101
1,2016-01-02,3,2,1,2016,5,False,True,20160102
2,2016-01-03,4,3,1,2016,6,False,True,20160103
3,2016-01-04,0,4,1,2016,0,True,False,20160104
4,2016-01-05,0,5,1,2016,1,True,False,20160105


### Обрабатываем `train` и `test`

Приводим к единому формату

In [6]:
data['train']['id'] = data['train']['air_store_id'] + '_' + data['train']['visit_date']

data['test']['air_store_id'] = [id[:20] for id in data['test']['id']]
data['test']['visit_date']   = [id[21:] for id in data['test']['id']]

Дополнительные колонки даты

In [7]:
data['train'] = pd.merge(data['train'], data['dates'], how='left', on=['visit_date'])
data['test']  = pd.merge(data['test'],  data['dates'], how='left', on=['visit_date'])

In [8]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (252108, 12)
test  (32019, 12)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118


### Формируем `reserve`

 Расширяем `air_reserve` данными из `hpg_reserve`

In [9]:
hpg_reserve_air = pd.merge(data['hpg_reserve'], data['air_hpd'], how='inner', on=['hpg_store_id'])
data['reserve'] = pd.concat([data['air_reserve'], hpg_reserve_air.drop('hpg_store_id', axis=1)])

In [10]:
print('air_reserve', data['air_reserve'].shape)
print('reserve    ', data['reserve'].shape)

air_reserve (92378, 4)
reserve     (120561, 4)


Дополнительные колонки даты в `reserve`

In [11]:
data['reserve']['visit_datetime']   = pd.to_datetime(data['reserve']['visit_datetime'])
data['reserve']['reserve_datetime'] = pd.to_datetime(data['reserve']['reserve_datetime'])

data['reserve']['visit_date'] = data['reserve']['visit_datetime'].dt.strftime('%Y-%m-%d')
    
data['reserve']['reserve_diff'] = data['reserve']['visit_datetime'] - data['reserve']['reserve_datetime']
data['reserve']['reserve_diff'] = data['reserve'].apply(lambda row: row['reserve_diff'].days, axis=1)

Фильтруем резервы, оформленные на той же неделе, что и посещение

In [12]:
# data['reserve'] = data['reserve'][data['reserve']['reserve_diff'] > data['reserve']['visit_datetime'].dt.dayofweek]

In [13]:
print('reserve', data['reserve'].shape)

data['reserve'].head()

reserve (120561, 6)


Unnamed: 0,air_store_id,reserve_datetime,reserve_visitors,visit_datetime,visit_date,reserve_diff
0,air_877f79706adbfb06,2016-01-01 16:00:00,1,2016-01-01 19:00:00,2016-01-01,0
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,3,2016-01-01 19:00:00,2016-01-01,0
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,6,2016-01-01 19:00:00,2016-01-01,0
3,air_877f79706adbfb06,2016-01-01 16:00:00,2,2016-01-01 20:00:00,2016-01-01,0
4,air_db80363d35f10926,2016-01-01 01:00:00,5,2016-01-01 20:00:00,2016-01-01,0


Объединяем записи в `reserve`

In [14]:
reserve = data['reserve'][['air_store_id', 'visit_date']]
for op_type in ['sum', 'mean', 'median', 'min', 'max', 'count']:
    tmp = getattr(
        data['reserve']
            .groupby(['air_store_id', 'visit_date'], as_index=False)
            [['reserve_diff', 'reserve_visitors']],
        op_type,
    )().rename(columns={
        'reserve_diff':     'reserve_diff_'     + op_type,
        'reserve_visitors': 'reserve_visitors_' + op_type,
    })
    
    reserve = pd.merge(reserve, tmp, how='left', on=['air_store_id', 'visit_date'])
    
data['reserve'] = reserve

In [15]:
print('reserve', data['reserve'].shape)

data['reserve'].head()

reserve (120561, 14)


Unnamed: 0,air_store_id,visit_date,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count
0,air_877f79706adbfb06,2016-01-01,0,3,0.0,1.5,0.0,1.5,0,1,0,2,2,2
1,air_db4b38ebe7a7ceff,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,3,0,6,2,2
2,air_db4b38ebe7a7ceff,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,3,0,6,2,2
3,air_877f79706adbfb06,2016-01-01,0,3,0.0,1.5,0.0,1.5,0,1,0,2,2,2
4,air_db80363d35f10926,2016-01-01,0,9,0.0,4.5,0.0,4.5,0,4,0,5,2,2


Добавляем `reserve` в `train` / `test` 

In [16]:
for ds in ['train', 'test']:
    data[ds] = pd.merge(data[ds], data['reserve'], how='left', on=['air_store_id', 'visit_date'])
    data[ds] = data[ds].fillna(0)

In [17]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (324849, 24)
test  (32959, 24)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Формируем `stores`

In [18]:
stores_id = data['air_stores']['air_store_id'].unique()
data['stores'] = pd.concat(
    [pd.DataFrame({
        'air_store_id':     stores_id,
        'visit_dow':        [i] * len(stores_id)
    }) for i in range(7)],
    axis=0,
    ignore_index=True,
).reset_index(drop=True)

In [19]:
for op_type in ['sum', 'mean', 'median', 'min', 'max', 'count']:
    tmp = getattr(
        data['train']
            .groupby(['air_store_id', 'visit_dow'], as_index=False)
            [['visitors']],
        op_type,
    )().rename(columns={
        'visitors': 'visitors_' + op_type,
    })
    
    data['stores'] = pd.merge(data['stores'], tmp, how='left', on=['air_store_id', 'visit_dow'])

In [20]:
data['stores'] = pd.merge(data['stores'], data['air_stores'], how='left', on=['air_store_id'])

In [21]:
data['stores']['air_genre_name'] = data['stores']['air_genre_name']
data['stores']['air_area_name']  = data['stores']['air_area_name']

Добавляем признаки на основе `latitude` и `longitude`

In [22]:
data['stores']['latitude_max']  = data['stores']['latitude'].max()  - data['stores']['latitude']
data['stores']['longitude_max'] = data['stores']['longitude'].max() - data['stores']['longitude']

data['stores']['latitude_longitude'] = data['stores']['latitude'] + data['stores']['longitude']

data['stores']['latitude_longitude_max'] = data['stores']['latitude_longitude'].max() - data['stores']['latitude_longitude']

In [23]:
print('stores', data['stores'].shape)

data['stores'].head()

stores (5803, 16)


Unnamed: 0,air_store_id,visit_dow,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count,air_genre_name,air_area_name,latitude,longitude,latitude_max,longitude_max,latitude_longitude,latitude_longitude_max
0,air_0f0cdeee6c9bf3d7,0,1165.0,21.181818,18.0,2.0,39.0,55.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
1,air_7cc17a324ae5c7dc,0,397.0,12.030303,9.0,1.0,34.0,33.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
2,air_fee8dcf4d619598e,0,830.0,20.243902,18.0,7.0,39.0,41.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
3,air_a17f0778617c76e2,0,158.0,26.333333,27.5,19.0,31.0,6.0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,9.325508,9.075546,169.892977,18.401054
4,air_83db5aff8f50478e,0,248.0,7.085714,7.0,1.0,13.0,35.0,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363


### Объединияем `train` / `test` и `stores`

In [24]:
for ds in ['train', 'test']:
    data[ds] = pd.merge(data[ds], data['stores'], how='left', on=['air_store_id', 'visit_dow'])

In [25]:
print('train', data['train'].shape)
print('test ', data['test'].shape)

data['train'].head()

train (324849, 38)
test  (32959, 38)


Unnamed: 0,air_store_id,visit_date,visitors,id,visit_holiday,visit_day,visit_month,visit_year,visit_dow,visit_work,visit_weekend,visit_date_int,reserve_diff_sum,reserve_visitors_sum,reserve_diff_mean,reserve_visitors_mean,reserve_diff_median,reserve_visitors_median,reserve_diff_min,reserve_visitors_min,reserve_diff_max,reserve_visitors_max,reserve_diff_count,reserve_visitors_count,visitors_sum,visitors_mean,visitors_median,visitors_min,visitors_max,visitors_count,air_genre_name,air_area_name,latitude,longitude,latitude_max,longitude_max,latitude_longitude,latitude_longitude_max
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13,0,13,1,2016,2,True,False,20160113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1526.0,23.84375,25.0,7.0,57.0,64.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14,0,14,1,2016,3,True,False,20160114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1319.0,20.292308,21.0,2.0,54.0,65.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15,0,15,1,2016,4,True,False,20160115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2258.0,34.738462,35.0,4.0,61.0,65.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16,0,16,1,2016,5,False,True,20160116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1825.0,27.651515,27.0,6.0,53.0,66.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18,0,18,1,2016,0,True,False,20160118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,784.0,13.754386,12.0,2.0,34.0,57.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,8.362564,4.521799,175.409667,12.884363


## Обучение

In [26]:
columns = pd.Series([col for col in data['train'] if col not in ['id', 'visit_date', 'visitors']])
cat_features = columns[columns.isin([
    'air_store_id',
    'visit_holiday',
    'visit_day',
    'visit_month',
    'visit_year',
    'visit_dow',
    'visit_work',
    'visit_weekend',
    'air_genre_name',
    'air_area_name',
])].index

In [27]:
train_pool = catboost.Pool(
    data['train'][columns],
    np.log1p(data['train']['visitors'].values),
    cat_features=cat_features.tolist(),
)

test_pool = catboost.Pool(
    data['test'][columns],
    cat_features=cat_features.tolist(),
)

In [28]:
model = catboost.CatBoostRegressor(
     iterations=1000,
)

model.fit(train_pool)

0:	learn: 2.9627196	total: 397ms	remaining: 6m 36s
1:	learn: 2.8766005	total: 664ms	remaining: 5m 31s
2:	learn: 2.7931125	total: 925ms	remaining: 5m 7s
3:	learn: 2.7122098	total: 1.18s	remaining: 4m 55s
4:	learn: 2.6338169	total: 1.39s	remaining: 4m 36s
5:	learn: 2.5578537	total: 1.64s	remaining: 4m 31s
6:	learn: 2.4842600	total: 1.84s	remaining: 4m 21s
7:	learn: 2.4129383	total: 2.08s	remaining: 4m 18s
8:	learn: 2.3438343	total: 2.33s	remaining: 4m 16s
9:	learn: 2.2769116	total: 2.55s	remaining: 4m 12s
10:	learn: 2.2120643	total: 2.78s	remaining: 4m 10s
11:	learn: 2.1492595	total: 2.99s	remaining: 4m 6s
12:	learn: 2.0884394	total: 3.25s	remaining: 4m 6s
13:	learn: 2.0295151	total: 3.48s	remaining: 4m 4s
14:	learn: 1.9724761	total: 3.68s	remaining: 4m 1s
15:	learn: 1.9172408	total: 3.91s	remaining: 4m
16:	learn: 1.8637503	total: 4.15s	remaining: 3m 59s
17:	learn: 1.8119755	total: 4.38s	remaining: 3m 58s
18:	learn: 1.7618616	total: 4.63s	remaining: 3m 58s
19:	learn: 1.7133575	total: 4.8

158:	learn: 0.4751320	total: 38.8s	remaining: 3m 25s
159:	learn: 0.4748538	total: 39.8s	remaining: 3m 28s
160:	learn: 0.4744537	total: 40.2s	remaining: 3m 29s
161:	learn: 0.4743418	total: 40.6s	remaining: 3m 29s
162:	learn: 0.4739905	total: 41.2s	remaining: 3m 31s
163:	learn: 0.4737696	total: 41.5s	remaining: 3m 31s
164:	learn: 0.4735765	total: 41.9s	remaining: 3m 31s
165:	learn: 0.4733490	total: 42.3s	remaining: 3m 32s
166:	learn: 0.4732733	total: 42.5s	remaining: 3m 32s
167:	learn: 0.4729627	total: 42.8s	remaining: 3m 31s
168:	learn: 0.4728652	total: 43.1s	remaining: 3m 31s
169:	learn: 0.4726640	total: 43.4s	remaining: 3m 31s
170:	learn: 0.4725203	total: 43.7s	remaining: 3m 31s
171:	learn: 0.4724421	total: 44s	remaining: 3m 31s
172:	learn: 0.4723724	total: 44.2s	remaining: 3m 31s
173:	learn: 0.4723155	total: 44.5s	remaining: 3m 31s
174:	learn: 0.4720219	total: 44.8s	remaining: 3m 31s
175:	learn: 0.4719427	total: 45s	remaining: 3m 30s
176:	learn: 0.4718702	total: 45.3s	remaining: 3m 3

313:	learn: 0.4606340	total: 1m 22s	remaining: 3m
314:	learn: 0.4605308	total: 1m 22s	remaining: 3m
315:	learn: 0.4605122	total: 1m 23s	remaining: 3m
316:	learn: 0.4604820	total: 1m 23s	remaining: 2m 59s
317:	learn: 0.4604258	total: 1m 23s	remaining: 2m 59s
318:	learn: 0.4603782	total: 1m 24s	remaining: 2m 59s
319:	learn: 0.4603099	total: 1m 24s	remaining: 2m 59s
320:	learn: 0.4602352	total: 1m 24s	remaining: 2m 58s
321:	learn: 0.4601820	total: 1m 24s	remaining: 2m 58s
322:	learn: 0.4601085	total: 1m 25s	remaining: 2m 58s
323:	learn: 0.4600215	total: 1m 25s	remaining: 2m 58s
324:	learn: 0.4599907	total: 1m 25s	remaining: 2m 57s
325:	learn: 0.4599398	total: 1m 25s	remaining: 2m 57s
326:	learn: 0.4599071	total: 1m 26s	remaining: 2m 57s
327:	learn: 0.4598792	total: 1m 26s	remaining: 2m 56s
328:	learn: 0.4598372	total: 1m 26s	remaining: 2m 56s
329:	learn: 0.4597946	total: 1m 26s	remaining: 2m 56s
330:	learn: 0.4597628	total: 1m 27s	remaining: 2m 56s
331:	learn: 0.4597413	total: 1m 27s	rema

466:	learn: 0.4550293	total: 2m 3s	remaining: 2m 20s
467:	learn: 0.4550138	total: 2m 3s	remaining: 2m 20s
468:	learn: 0.4549901	total: 2m 4s	remaining: 2m 20s
469:	learn: 0.4549560	total: 2m 4s	remaining: 2m 20s
470:	learn: 0.4549447	total: 2m 4s	remaining: 2m 19s
471:	learn: 0.4549339	total: 2m 4s	remaining: 2m 19s
472:	learn: 0.4549245	total: 2m 5s	remaining: 2m 19s
473:	learn: 0.4548914	total: 2m 5s	remaining: 2m 19s
474:	learn: 0.4548743	total: 2m 5s	remaining: 2m 19s
475:	learn: 0.4547896	total: 2m 6s	remaining: 2m 18s
476:	learn: 0.4547716	total: 2m 6s	remaining: 2m 18s
477:	learn: 0.4547498	total: 2m 6s	remaining: 2m 18s
478:	learn: 0.4547192	total: 2m 6s	remaining: 2m 18s
479:	learn: 0.4546882	total: 2m 7s	remaining: 2m 17s
480:	learn: 0.4546676	total: 2m 7s	remaining: 2m 17s
481:	learn: 0.4546397	total: 2m 7s	remaining: 2m 17s
482:	learn: 0.4546208	total: 2m 8s	remaining: 2m 17s
483:	learn: 0.4546034	total: 2m 8s	remaining: 2m 16s
484:	learn: 0.4545867	total: 2m 8s	remaining: 

620:	learn: 0.4515992	total: 2m 45s	remaining: 1m 40s
621:	learn: 0.4515672	total: 2m 45s	remaining: 1m 40s
622:	learn: 0.4515403	total: 2m 45s	remaining: 1m 40s
623:	learn: 0.4515231	total: 2m 46s	remaining: 1m 40s
624:	learn: 0.4515027	total: 2m 46s	remaining: 1m 39s
625:	learn: 0.4514884	total: 2m 46s	remaining: 1m 39s
626:	learn: 0.4514598	total: 2m 47s	remaining: 1m 39s
627:	learn: 0.4514443	total: 2m 47s	remaining: 1m 39s
628:	learn: 0.4514238	total: 2m 47s	remaining: 1m 38s
629:	learn: 0.4514133	total: 2m 47s	remaining: 1m 38s
630:	learn: 0.4513890	total: 2m 48s	remaining: 1m 38s
631:	learn: 0.4513588	total: 2m 48s	remaining: 1m 38s
632:	learn: 0.4513482	total: 2m 48s	remaining: 1m 37s
633:	learn: 0.4513322	total: 2m 48s	remaining: 1m 37s
634:	learn: 0.4513139	total: 2m 49s	remaining: 1m 37s
635:	learn: 0.4513004	total: 2m 49s	remaining: 1m 36s
636:	learn: 0.4512894	total: 2m 49s	remaining: 1m 36s
637:	learn: 0.4512843	total: 2m 49s	remaining: 1m 36s
638:	learn: 0.4512732	total:

774:	learn: 0.4493000	total: 3m 28s	remaining: 1m
775:	learn: 0.4492855	total: 3m 28s	remaining: 1m
776:	learn: 0.4492508	total: 3m 28s	remaining: 59.9s
777:	learn: 0.4492449	total: 3m 29s	remaining: 59.7s
778:	learn: 0.4492395	total: 3m 29s	remaining: 59.4s
779:	learn: 0.4492276	total: 3m 29s	remaining: 59.1s
780:	learn: 0.4492091	total: 3m 30s	remaining: 58.9s
781:	learn: 0.4491903	total: 3m 30s	remaining: 58.6s
782:	learn: 0.4491817	total: 3m 30s	remaining: 58.4s
783:	learn: 0.4491731	total: 3m 30s	remaining: 58.1s
784:	learn: 0.4491607	total: 3m 31s	remaining: 57.8s
785:	learn: 0.4491439	total: 3m 31s	remaining: 57.6s
786:	learn: 0.4491351	total: 3m 31s	remaining: 57.3s
787:	learn: 0.4491289	total: 3m 31s	remaining: 57s
788:	learn: 0.4491225	total: 3m 32s	remaining: 56.7s
789:	learn: 0.4491161	total: 3m 32s	remaining: 56.5s
790:	learn: 0.4491008	total: 3m 32s	remaining: 56.2s
791:	learn: 0.4490893	total: 3m 32s	remaining: 55.9s
792:	learn: 0.4490848	total: 3m 33s	remaining: 55.6s
7

930:	learn: 0.4473850	total: 4m 11s	remaining: 18.7s
931:	learn: 0.4473761	total: 4m 12s	remaining: 18.4s
932:	learn: 0.4473602	total: 4m 12s	remaining: 18.1s
933:	learn: 0.4473431	total: 4m 12s	remaining: 17.9s
934:	learn: 0.4473397	total: 4m 12s	remaining: 17.6s
935:	learn: 0.4473254	total: 4m 13s	remaining: 17.3s
936:	learn: 0.4473161	total: 4m 13s	remaining: 17s
937:	learn: 0.4473044	total: 4m 13s	remaining: 16.8s
938:	learn: 0.4472995	total: 4m 13s	remaining: 16.5s
939:	learn: 0.4472885	total: 4m 14s	remaining: 16.2s
940:	learn: 0.4472811	total: 4m 14s	remaining: 16s
941:	learn: 0.4472492	total: 4m 14s	remaining: 15.7s
942:	learn: 0.4472398	total: 4m 14s	remaining: 15.4s
943:	learn: 0.4472322	total: 4m 15s	remaining: 15.1s
944:	learn: 0.4472246	total: 4m 15s	remaining: 14.9s
945:	learn: 0.4472186	total: 4m 15s	remaining: 14.6s
946:	learn: 0.4472099	total: 4m 15s	remaining: 14.3s
947:	learn: 0.4471946	total: 4m 16s	remaining: 14.1s
948:	learn: 0.4471873	total: 4m 16s	remaining: 13.

<catboost.core.CatBoostRegressor at 0x7f86bfaeedd8>

In [29]:
train_preds = model.predict(train_pool)
print('RMSLE:', RMSLE(np.log1p(data['train']['visitors'].values), train_preds))

RMSLE: 0.405879169048


In [30]:
test_preds = model.predict(test_pool)
data['test']['visitors'] = np.expm1(test_preds).clip(min=0.)

In [31]:
test = data['test'][['id', 'visitors']]
test = test.groupby('id', as_index=False).mean()

In [32]:
test.to_csv('result/submission.csv', index=False)