# M5-forecasting, CatBoost.

In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/b1/61/2b8106c8870601671d99ca94d8b8d180f2b740b7cdb95c930147508abcf9/catboost-0.23-cp36-none-manylinux1_x86_64.whl (64.7MB)
[K     |████████████████████████████████| 64.8MB 45kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.23


In [0]:
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from catboost import Pool, CatBoostRegressor
from catboost.utils import get_gpu_device_count
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

In [18]:
# Report how many GPU devices CatBoost can use; the regressor further down is
# configured with task_type='GPU'.
print('GPU devices СatBoost:', get_gpu_device_count())

GPU devices СatBoost: 1


In [3]:
# Colab-specific: mount Google Drive at /gdrive so that the data folder
# configured in DATA_DIR becomes readable. Requires interactive authorization.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [19]:
# Folder on the mounted drive from which get_df() reads the CSV files.
DATA_DIR = '/gdrive/My Drive/M5-forecasting'
# Version tag embedded in the saved model file name.
MODEL_VER = 'v0'
# Days of history before END_DATE kept in the prediction frame so that
# lag / rolling features can be computed for the forecast days.
BACKWARD_LAGS = 60
# Number of the last day column ('d_1913') that carries observed sales.
END_D = 1913
# First day number used for training: about 1.2 years (438 days) before END_D.
CUT_D = END_D - int(365 * 1.2)
# Calendar date of day END_D; both values are hard-coded and must be changed together.
END_DATE = '2016-04-24'
print(datetime.strptime(END_DATE, '%Y-%m-%d'))
# Seed NumPy's global RNG; the random train/validation split below draws from it.
np.random.seed(0)

2016-04-24 00:00:00


## Data loading

In [0]:
# Explicit column dtypes handed to pd.read_csv for calendar.csv.
CALENDAR_DTYPES = dict(
    date='str',
    wm_yr_wk='int16',
    weekday='object',
    wday='int16',
    month='int16',
    year='int16',
    d='object',
    event_name_1='object',
    event_type_1='object',
    event_name_2='object',
    event_type_2='object',
    snap_CA='int16',
    snap_TX='int16',
    snap_WI='int16',
)

# Calendar columns that pd.read_csv should parse into datetimes.
PARSE_DATES = ['date']

# Explicit column dtypes handed to pd.read_csv for sell_prices.csv.
SPRICES_DTYPES = dict(
    store_id='object',
    item_id='object',
    wm_yr_wk='int16',
    sell_price='float32',
)

In [0]:
def get_df(is_train:bool=True, 
           backward_lags:int=None):
    """Load sales, calendar and price data into one long-format frame.

    The wide sales table (one column per day, 'd_<n>') is melted to one row
    per (id, day) and joined with calendar.csv on 'd' and with
    sell_prices.csv on (store_id, item_id, wm_yr_wk). Both joins use the
    default inner merge, so rows without a matching price are dropped.

    Parameters
    ----------
    is_train : bool
        True  - keep the days from CUT_D onwards (training frame).
        False - append 56 (28 + 28) empty future days with NaN sales and keep
        only the days from END_D - backward_lags onwards (prediction frame).
    backward_lags : int
        Days of history to keep before END_D / END_DATE. Required when
        is_train is False, ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the id columns, 'd', 'sales', the calendar
        columns and 'sell_price'.

    Raises
    ------
    ValueError
        If is_train is False and backward_lags is None.
    """
    if not is_train and backward_lags is None:
        # Fail before the large CSV is read, with a clear message, instead of
        # a TypeError from `END_D - None` further down.
        raise ValueError('backward_lags is required when is_train=False')

    strain = pd.read_csv('{}/sales_train_validation.csv'.format(DATA_DIR))
    print('read train:', strain.shape)
    cat_cols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    last_day = int(strain.columns[-1].replace('d_', ''))
    # First day number that is actually kept for the requested mode.
    first_day = CUT_D if is_train else END_D - backward_lags
    print('first day is:', first_day)
    print('last day is:', last_day)
    if not is_train:
        # Placeholder columns for the forecast horizon; sales are unknown (NaN).
        for day in range(last_day + 1, last_day + 28 + 28 + 1):
            strain['d_{}'.format(day)] = np.nan
    value_vars = [col for col in strain.columns
                  if (col.startswith('d_') and (int(col.replace('d_', '')) >= first_day))]
    strain = pd.melt(
        strain,
        id_vars = cat_cols,
        value_vars = value_vars,
        var_name = 'd',
        value_name = 'sales'
    )
    print('melted train:', strain.shape)

    calendar = pd.read_csv('{}/calendar.csv'.format(DATA_DIR), dtype=CALENDAR_DTYPES, parse_dates=PARSE_DATES)
    print('read calendar:', calendar.shape)
    strain = strain.merge(calendar, on='d', copy=False)
    del calendar
    gc.collect()
    print('calendar merge done')

    sprices = pd.read_csv('{}/sell_prices.csv'.format(DATA_DIR), dtype=SPRICES_DTYPES)
    print('read prices:', sprices.shape)
    strain = strain.merge(
        sprices, 
        on=['store_id', 'item_id', 'wm_yr_wk'], 
        copy=False
    )
    del sprices
    gc.collect()
    print('prices merge done')

    print('begin train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    if not is_train:
        # Same cut as the day-number filter above, expressed on calendar dates.
        cut_date = datetime.strptime(END_DATE, '%Y-%m-%d') - timedelta(days=backward_lags)
        strain = strain.loc[strain['date'] >= cut_date]
    print('date cut train:', strain.shape)
    print('cut train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    return strain

In [0]:
def make_features(strain):
    """Add lag, rolling-mean and date features to a long-format sales frame.

    The frame is modified in place and also returned. It must contain the
    columns 'id', 'item_id', 'sales', 'date' (datetime) and 'd' (strings of
    the form 'd_<n>'). Rows are expected to be in chronological order within
    each 'id', because lags are taken by row position inside the group.

    Added columns:
      * lag_7, lag_28 - sales shifted by 7 / 28 rows within each 'id';
      * <lag>_<group>_rmean_<window> - rolling mean of each lag column over
        7 / 28 rows, grouped by 'id' and by 'item_id';
      * week_num, quarter, mday - int16 date parts of 'date'.
    Finally 'd' is converted from 'd_<n>' strings to the integer n.

    NOTE(review): for the 'item_id' grouping the rolling window runs over
    consecutive rows of the group, which presumably interleave several ids
    of the same item - confirm that this is the intended feature.
    """
    print('in dataframe:', strain.shape)
    lags = [7, 28]
    windows = [7, 28]
    wnd_feats = ['id', 'item_id']
    lag_cols = ['lag_{}'.format(lag) for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        strain[lag_col] = strain.groupby('id')['sales'].shift(lag)
    print('lag sales done')
    for wnd_feat in wnd_feats:
        for wnd in windows:
            for lag_col in lag_cols:
                wnd_col = '{}_{}_rmean_{}'.format(lag_col, wnd_feat, wnd)
                strain[wnd_col] = strain.groupby(wnd_feat)[lag_col].transform(
                    lambda x: x.rolling(wnd).mean()
                )
        print('rolling mean sales for feature done:', wnd_feat)
    dates = strain['date'].dt
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week yields the same ISO week number. Fall back to the
    # old accessor on pandas versions that predate isocalendar().
    if hasattr(dates, 'isocalendar'):
        week_num = dates.isocalendar().week
    else:
        week_num = dates.weekofyear
    # Insertion order (week_num, quarter, mday) fixes the feature column order.
    strain['week_num'] = week_num.astype('int16')
    strain['quarter'] = dates.quarter.astype('int16')
    strain['mday'] = dates.day.astype('int16')
    print('date features done')
    # 'd_1475' -> 1475, vectorised instead of a per-row Python lambda.
    strain['d'] = strain['d'].str.replace('d_', '', regex=False).astype(int)
    print('out dataframe:', strain.shape)
    return strain

In [23]:
%%time
# Build the training frame (history from day CUT_D onwards), then add the
# lag / rolling-mean / date features to it.
strain = get_df(is_train=True, backward_lags=None)
strain = make_features(strain)

read train: (30490, 1919)
first day is: 1475
last day is: 1913
melted train: (13385110, 8)
read calendar: (1969, 14)
calendar merge done
read prices: (6841121, 4)
prices merge done
begin train date: 2015-02-11 00:00:00
end train date: 2016-04-24 00:00:00
date cut train: (13315034, 22)
cut train date: 2015-02-11 00:00:00
end train date: 2016-04-24 00:00:00
in dataframe: (13315034, 22)
lag sales done
rolling mean sales for feature done: id
rolling mean sales for feature done: item_id
date features done
out dataframe: (13315034, 35)
CPU times: user 2min 24s, sys: 11.6 s, total: 2min 35s
Wall time: 2min 36s


In [25]:
# Preview the first rows; lag / rolling columns are NaN at the start of the history.
strain.head(3)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,lag_7_id_rmean_7,lag_28_id_rmean_7,lag_7_id_rmean_28,lag_28_id_rmean_28,lag_7_item_id_rmean_7,lag_28_item_id_rmean_7,lag_7_item_id_rmean_28,lag_28_item_id_rmean_28,week_num,quarter,mday
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA_1,HOBBIES,CA,1475,0,2015-02-11,11502,Wednesday,5,2,2015,,,,,0,1,1,8.26,,,,,,,,,,,7,1,11
1,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA_1,HOBBIES,CA,1476,0,2015-02-12,11502,Thursday,6,2,2015,,,,,0,1,1,8.26,,,,,,,,,,,7,1,12
2,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA_1,HOBBIES,CA,1477,3,2015-02-13,11502,Friday,7,2,2015,,,,,0,1,0,8.26,,,,,,,,,,,7,1,13


In [0]:
# Columns that are not fed to the model: the row identifier, the target and
# some calendar fields.
drop_cols = ['id', 'sales', 'date', 'wm_yr_wk', 'weekday']
# Every remaining column is a model feature (this includes the day number 'd').
train_cols = strain.columns[~strain.columns.isin(drop_cols)]
# Columns passed to CatBoost as categorical features.
cat_cols = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
]
# Replace missing categorical values (e.g. days without an event) by 0
# before the Pools are built.
strain[cat_cols] = strain[cat_cols].fillna(0)

## CatBoost

In [10]:
%%time
# Hold out a random 15% of the rows for validation (drawn from NumPy's global
# RNG, seeded in the configuration cell).
# NOTE(review): rows are sampled at random from a time-ordered frame with lag
# features, so the validation score is presumably optimistic compared with a
# split by date - confirm whether a time-based hold-out is wanted.
val_size = int(strain.shape[0] * .15)
val_idxs = np.random.choice(strain.index.values, val_size, replace=False)
train_idxs = np.setdiff1d(strain.index.values, val_idxs)
# Select rows and columns in a single .loc call: the chained form
# strain.loc[rows][cols] first materialises a full-width copy of the rows.
train_pool = Pool(
    strain.loc[train_idxs, train_cols],
    strain.loc[train_idxs, 'sales'],
    cat_features=cat_cols
)
val_pool = Pool(
    strain.loc[val_idxs, train_cols],
    strain.loc[val_idxs, 'sales'],
    cat_features=cat_cols
)
# The Pools hold the data from here on; release the large frame.
del strain
gc.collect()

CPU times: user 42.1 s, sys: 1.85 s, total: 44 s
Wall time: 41.1 s


In [11]:
# Gradient-boosted regressor trained on the GPU; progress is logged every
# 200 iterations and the validation Pool is used as the evaluation set.
model = CatBoostRegressor(
    iterations=1000,
    task_type='GPU',
    verbose=200,
    loss_function='RMSE',
    boosting_type='Plain',
    depth=8,
    gpu_cat_features_storage='CpuPinnedMemory',
    #max_ctr_complexity=2
)  # this closing parenthesis was missing, which made the cell a syntax error
model.fit(
    train_pool,
    eval_set = val_pool
    #plot=True   
)
# The Pools are no longer needed once the model is fitted.
del train_pool, val_pool
gc.collect()

Learning rate set to 0.35204
0:	learn: 2.9428610	test: 2.9549983	best: 2.9549983 (0)	total: 1.32s	remaining: 21m 59s
200:	learn: 1.9990122	test: 2.0761268	best: 2.0761084 (199)	total: 3m 36s	remaining: 14m 22s
400:	learn: 1.9394297	test: 2.0586270	best: 2.0586270 (400)	total: 7m 10s	remaining: 10m 42s
600:	learn: 1.9001072	test: 2.0478947	best: 2.0478416 (598)	total: 10m 50s	remaining: 7m 11s
800:	learn: 1.8674147	test: 2.0435845	best: 2.0432455 (789)	total: 14m 27s	remaining: 3m 35s
999:	learn: 1.8398260	test: 2.0388751	best: 2.0387949 (998)	total: 18m 2s	remaining: 0us
bestTest = 2.038794938
bestIteration = 998
Shrink model to first 999 iterations.


0

In [0]:
# Persist the trained model in the current working directory, tagged with MODEL_VER.
model.save_model('model_{}.cbm'.format(MODEL_VER))

## Prediction Loop

In [13]:
%%time
# Recursive day-by-day forecast over the 56-day (28 + 28) horizon: each day's
# prediction is written back into `spred` so that later days can use it
# through the lag / rolling features.
spred = get_df(is_train=False, backward_lags=BACKWARD_LAGS)
# Parsed once; the value does not change between iterations.
end_date = datetime.strptime(END_DATE, '%Y-%m-%d')
for pred_day in tqdm(range(1, 28 + 28 + 1)):
    pred_date = end_date + timedelta(days=pred_day)
    pred_date_back = pred_date - timedelta(days=BACKWARD_LAGS + 1)
    print('-' * 70)
    print('forecast day forward:', pred_day, '| forecast date:', pred_date) 
    # Work on a copy of the trailing window: make_features() mutates its input.
    spred_data = spred[(spred['date'] >= pred_date_back) & (spred['date'] <= pred_date)].copy()
    spred_data = make_features(spred_data)
    # Build the mask from spred_data itself; the previous mask came from the
    # larger `spred` frame and only worked through implicit index alignment.
    spred_data = spred_data.loc[spred_data['date'] == pred_date, train_cols]
    spred_data[cat_cols] = spred_data[cat_cols].fillna(0)
    spred.loc[spred['date'] == pred_date, 'sales'] = model.predict(spred_data)
del spred_data
gc.collect()

read train: (30490, 1919)
first day is: 1475
last day is: 1913
melted train: (3567330, 8)
read calendar: (1969, 14)
calendar merge done
read prices: (6841121, 4)
prices merge done
begin train date: 2016-02-24 00:00:00
end train date: 2016-06-19 00:00:00
date cut train: (3567330, 22)
cut train date: 2016-02-24 00:00:00
end train date: 2016-06-19 00:00:00


HBox(children=(IntProgress(value=0, max=56), HTML(value='')))

----------------------------------------------------------------------
forecast day forward: 1 | forecast date: 2016-04-25 00:00:00
in dataframe: (1890380, 22)
lag sales done
rolling mean sales for feature done: id
rolling mean sales for feature done: item_id
date features done
out dataframe: (1890380, 35)
----------------------------------------------------------------------
forecast day forward: 2 | forecast date: 2016-04-26 00:00:00
in dataframe: (1890380, 22)
lag sales done
rolling mean sales for feature done: id
rolling mean sales for feature done: item_id
date features done
out dataframe: (1890380, 35)
----------------------------------------------------------------------
forecast day forward: 3 | forecast date: 2016-04-27 00:00:00
in dataframe: (1890380, 22)
lag sales done
rolling mean sales for feature done: id
rolling mean sales for feature done: item_id
date features done
out dataframe: (1890380, 35)
----------------------------------------------------------------------
forec

## Submission

In [14]:
# Keep only the forecast rows, i.e. everything dated after the last observed day.
spred_subm = spred.loc[spred['date'] > END_DATE, ['id', 'd', 'sales']].copy()
# Day number of END_DATE, read back from the data.
last_d = int(spred.loc[spred['date'] == END_DATE, 'd'].iloc[0].replace('d_', ''))
print('last d num:', last_d)
# Relabel absolute day ids as horizon steps: the day after last_d becomes 'F1'.
spred_subm['d'] = spred_subm['d'].map(
    lambda day: 'F{}'.format(int(day.replace('d_', '')) - last_d)
)
# Negative predictions are replaced by zero.
spred_subm['sales'] = spred_subm['sales'].mask(spred_subm['sales'] < 0, 0)

last d num: 1913


In [0]:
# Horizon column labels F1..F56 in submission order.
f_cols = ['F{}'.format(step) for step in range(1, 28 + 28 + 1)]
# Pivot to one row per id with one column per horizon step, fill gaps with 0
# and order the rows by id.
spred_subm = (
    spred_subm
    .set_index(['id', 'd'])
    .unstack()['sales'][f_cols]
    .reset_index()
    .fillna(0)
    .sort_values('id')
    .reset_index(drop=True)
)

In [16]:
# F1..F28 form the validation part, F29..F56 the evaluation part.
f_cols_val = ['F{}'.format(step) for step in range(1, 28 + 1)]
f_cols_eval = ['F{}'.format(step) for step in range(28 + 1, 28 + 28 + 1)]
# Split the wide frame into the two 28-day halves.
spred_subm_eval = spred_subm.drop(columns=f_cols_val)
spred_subm = spred_subm.drop(columns=f_cols_eval)
# The evaluation half reuses the F1..F28 column labels and gets '_evaluation' ids.
spred_subm_eval.columns = spred_subm.columns
spred_subm_eval['id'] = spred_subm_eval['id'].str.replace('validation', 'evaluation')
# Stack validation rows on top of evaluation rows and write the file.
spred_subm = pd.concat(
    [spred_subm, spred_subm_eval], axis=0, sort=False
).reset_index(drop=True)
spred_subm.to_csv('submission.csv', index=False)
print('submission saved:', spred_subm.shape)

submission saved: (60980, 29)
