# M5-forecasting, CatBoost.

In [1]:
!pip install catboost
!jupyter nbextension enable --py widgetsnbextension

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/51/fb/6eddb2667ee55c39bd12ab6676c412726fcbc86e1c0364fda8e6c3a62bc3/catboost-0.23.1-cp36-none-manylinux1_x86_64.whl (64.7MB)
[K     |████████████████████████████████| 64.7MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.23.1
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [0]:
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd


from catboost import Pool, CatBoostRegressor
from catboost.utils import get_gpu_device_count
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

In [3]:
print('GPU devices СatBoost:', get_gpu_device_count())

GPU devices СatBoost: 1


In [4]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


В таблицах представлены данные в период с `2011-01-29` по `2016-06-19`. Тестовый период размера `h=28` не включен, его и требуется предсказать.
Использование всех доступных данных за 5 лет имеет недостатки, т.к. значительно увеличивается время обучения модели, работа с таблицами и создания новых фичей. К тому же в таблицах достаточно товаров, которые не продовались в 2011-2013 годах. Будем строить предсказания, опираясь на 2 года назад.

In [5]:
DATA_DIR = '/gdrive/My Drive/M5-forecasting/data'
MODELS_DIR = '/gdrive/My Drive/M5-forecasting/models/'
SUBMS_DIR = '/gdrive/My Drive/M5-forecasting/submissions/'

MODEL_VER = 'v5'
BACKWARD_LAGS = 60

END_D = 1913
CUT_D = 350

END_DATE = '2016-04-24'
END_DATE_DT = datetime.strptime(END_DATE, '%Y-%m-%d')
print(datetime.strptime(END_DATE, '%Y-%m-%d'))
np.random.seed(0)

2016-04-24 00:00:00


## Загрузка данных

In [0]:
CALENDAR_DTYPES = {
    'date':             'str',
    'wm_yr_wk':         'int16', 
    'weekday':          'object',
    'wday':             'int16', 
    'month':            'int16', 
    'year':             'int16', 
    'd':                'object',
    'event_name_1':     'object',
    'event_type_1':     'object',
    'event_name_2':     'object',
    'event_type_2':     'object',
    'snap_CA':          'int16', 
    'snap_TX':          'int16', 
    'snap_WI':          'int16'
}

PARSE_DATES = ['date']

SPRICES_DTYPES = {
    'store_id':    'object', 
    'item_id':     'object', 
    'wm_yr_wk':    'int16',  
    'sell_price':  'float32'
}

In [0]:
def get_dataframe(
    is_train:bool=True, 
    backward_lags:int=28
) -> pd.DataFrame:
    sales_train = pd.read_csv('{}/sales_train_validation.csv'.format(DATA_DIR))
    print('sales_train_validation.csv:', sales_train.shape)

    cat_cols = ['id', 'item_id', 'dept_id',
                'store_id', 'cat_id', 'state_id']

    last_day = int(sales_train.columns[-1].replace('d_', ''))
    print('First day:', CUT_D)
    print('Last day:', last_day)

    if not is_train:
        """
        Если находимся на этапе предсказания, то формируем новые столбцы 
        вида d_{} для последующих предсказаний и сохраняем value_vars, 
        по которым сделаем pd.melt
        """
        for day in range(last_day + 1, last_day + 28 + 1):
            sales_train['d_{}'.format(day)] = np.nan
        value_vars = [col for col in sales_train.columns 
                      if (col.startswith('d_') and (int(col.replace('d_', '')) >= END_D - backward_lags))]
    else:
        value_vars = [col for col in sales_train.columns 
                      if (col.startswith('d_') and (int(col.replace('d_', '')) >= CUT_D))]

    sales_train = pd.melt(
        sales_train,
        id_vars = cat_cols,
        value_vars = value_vars,
        var_name = 'd',
        value_name = 'sales'
    )
    print('melted train:', sales_train.shape)

    calendar = pd.read_csv('{}/calendar.csv'.format(DATA_DIR), 
                           dtype=CALENDAR_DTYPES, 
                           parse_dates=PARSE_DATES)
    print('calendar.csv:', calendar.shape)

    sales_train = sales_train.merge(calendar, on='d', copy=False)
    print('calendar merge done')

    del calendar
    gc.collect()

    sales_prices = pd.read_csv('{}/sell_prices.csv'.format(DATA_DIR), dtype=SPRICES_DTYPES)
    print('read prices:', sales_prices.shape)

    sales_train = sales_train.merge(
        sales_prices, 
        on=['store_id', 'item_id', 'wm_yr_wk'], 
        copy=False
    )

    del sales_prices
    gc.collect()
    print('prices merge done')
    print('begin train date:', sales_train['date'].min())
    print('end train date:', sales_train['date'].max())
    if not is_train:
        """
        На этапе предсказания оставляем только последние 
        backward_lags дней из исходного датасета
        """
        sales_train = sales_train.loc[
            sales_train['date'] >= (datetime.strptime(END_DATE, '%Y-%m-%d') - timedelta(days=backward_lags))
        ]
    print('date cut train:', sales_train.shape)
    print('cut train date:', sales_train['date'].min())
    print('end train date:', sales_train['date'].max())
    return sales_train

In [0]:
def make_features(
    strain: pd.DataFrame
) -> pd.DataFrame:
    print('in dataframe:', strain.shape)
    lags = [7, 28]
    windows= [7, 28]
    wnd_feats = ['id', 'item_id']
    lag_cols = ['lag_{}'.format(lag) for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        strain[lag_col] = strain[['id', 'sales']].groupby('id')['sales'].shift(lag)
    print('lag sales done')

    for wnd_feat in wnd_feats:
        for wnd in windows:
            for lag_col in lag_cols:
                wnd_col = '{}_{}_rmean_{}'.format(lag_col, wnd_feat, wnd)
                strain[wnd_col] = strain[[wnd_feat, lag_col]].groupby(wnd_feat)[lag_col].transform(
                    lambda x: x.rolling(wnd).mean()
                )
        print('rolling mean sales for feature done:', wnd_feat)

    date_features = {
        'week_num': 'weekofyear',
        'quarter': 'quarter',
        'mday': 'day'
    }

    for date_feat_name, date_feat_func in date_features.items():
        strain[date_feat_name] = getattr(strain['date'].dt, date_feat_func).astype('int16')
    print('date features done')
    
    strain['d'] = strain['d'].apply(lambda x: int(x.replace('d_', '')))  
    print('out dataframe:', strain.shape)
    return strain

In [0]:
%%time
strain = get_dataframe(is_train=True, backward_lags=None)
strain = make_features(strain)

In [0]:
drop_cols = ['id', 'sales', 'date', 'wm_yr_wk', 'weekday']
train_cols = strain.columns[~strain.columns.isin(drop_cols)]
cat_cols = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
]
strain[cat_cols] = strain[cat_cols].fillna('Unknown')
strain.dropna(inplace=True)

## CatBoost

In [0]:
%%time
val_size = int(strain.shape[0] * 0.05)
val_idxs = np.random.choice(strain.index.values, 
                            val_size,
                            replace=False)
train_idxs = np.setdiff1d(strain.index.values, val_idxs)

train_pool = Pool(
    strain.loc[train_idxs][train_cols], 
    strain.loc[train_idxs]['sales'],
    cat_features=cat_cols
)
val_pool = Pool(
    strain.loc[val_idxs][train_cols], 
    strain.loc[val_idxs]['sales'],
    cat_features=cat_cols
)
del strain
gc.collect()

CPU times: user 59.8 s, sys: 2.09 s, total: 1min 1s
Wall time: 54.5 s


In [0]:
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.075,
    l2_leaf_reg = 0.1,
    task_type='GPU',
    verbose=200,
    loss_function='RMSE',
    boosting_type='Plain',
    depth=8,
    random_state=0
)

In [0]:
model.fit(
    train_pool,
    eval_set = val_pool,
    plot=True   
)
del train_pool, val_pool
gc.collect()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 3.4873576	test: 3.4838977	best: 3.4838977 (0)	total: 4.13s	remaining: 2h 17m 27s
200:	learn: 2.0941499	test: 2.1034626	best: 2.1034626 (200)	total: 18m 32s	remaining: 2h 45m 59s
400:	learn: 2.0546126	test: 2.0847609	best: 2.0847609 (400)	total: 36m 27s	remaining: 2h 25m 22s
600:	learn: 2.0296528	test: 2.0736687	best: 2.0736687 (600)	total: 54m 34s	remaining: 2h 7m 1s
800:	learn: 2.0088609	test: 2.0668745	best: 2.0668745 (800)	total: 1h 12m 7s	remaining: 1h 47m 57s
1000:	learn: 1.9924412	test: 2.0610395	best: 2.0610395 (1000)	total: 1h 29m 49s	remaining: 1h 29m 38s
1200:	learn: 1.9774924	test: 2.0554304	best: 2.0554304 (1200)	total: 1h 47m 26s	remaining: 1h 11m 28s
1400:	learn: 1.9658091	test: 2.0515876	best: 2.0515876 (1400)	total: 2h 5m 26s	remaining: 53m 37s
1600:	learn: 1.9542550	test: 2.0491041	best: 2.0490105 (1575)	total: 2h 23m 16s	remaining: 35m 42s
1800:	learn: 1.9438727	test: 2.0454635	best: 2.0454635 (1800)	total: 2h 41m 15s	remaining: 17m 49s
1999:	learn: 1.933955

0

In [0]:
model.save_model(MODELS_DIR+'catboost_model_lag-mean-fe_{}.cbm'.format(MODEL_VER))

In [0]:
model = CatBoostRegressor()
model.load_model(MODELS_DIR+'catboost_model_lag-mean-fe_{}.cbm'.format(MODEL_VER))

<catboost.core.CatBoostRegressor at 0x7f066b61ff28>

In [0]:
def make_features_prediction(
    strain: pd.DataFrame,
    day
) -> pd.DataFrame:
    print('in dataframe:', strain.shape)
    lags = [7, 28]
    windows= [7, 28]
    wnd_feats = ['id', 'item_id']
    lag_cols = ['lag_{}'.format(lag) for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        indexes = strain.loc[strain.date == day, lag_col].index
        strain.loc[strain.date == day, lag_col] = strain.loc[strain.date >= day - timedelta(days=lag), ['id', 'sales']].groupby('id')['sales'].shift(lag)[indexes]

    for wnd_feat in wnd_feats:
        for wnd in windows:
            for lag_col in lag_cols:
                wnd_col = '{}_{}_rmean_{}'.format(lag_col, wnd_feat, wnd)
                indexes = strain.loc[strain.date == day, wnd_col].index
                strain.loc[strain.date == day, wnd_col] = strain.loc[strain.date >= day - timedelta(days=wnd), [wnd_feat, lag_col]].groupby(wnd_feat)[lag_col].transform(
                    lambda x: x.rolling(wnd).mean()
                )[indexes]
    return strain

In [0]:
df = get_dataframe(False, BACKWARD_LAGS)
df = make_features(df)
df[cat_cols] = df[cat_cols].fillna('Unknown')

## Prediction Loop and Submission

In [0]:
%%time

alphas = [1.028, 1.023, 1.018]
weights = [1 / len(alphas)] * len(alphas)
sub = 0.

for icount, (alpha, weight) in tqdm(enumerate(zip(alphas, weights))):

    df = get_dataframe(False, BACKWARD_LAGS)
    df = make_features(df)
    df[cat_cols] = df[cat_cols].fillna('Unknown')

    cols = [f"F{i}" for i in range(1, 29)]
    for tdelta in range(0, 28):
        day = END_DATE_DT + timedelta(days=tdelta)
        print(f'Forecast day: {day}')
        test_df = df[(df.date >= day - timedelta(days=BACKWARD_LAGS)) & (df.date <= day)].copy()
        test_df = make_features_prediction(test_df, day)
        test_df = test_df.loc[test_df.date == day, train_cols]
        df.loc[df.date == day, "sales"] = alpha * model.predict(test_df)



    df_sub = df.loc[df.date >= END_DATE, ["id", "sales"]].copy()

    df_sub["F"] = [f"F{rank}" for rank in df_sub.groupby("id")["id"].cumcount()+1]
    df_sub = df_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    df_sub.fillna(0., inplace = True)
    df_sub.sort_values("id", inplace = True)
    df_sub.reset_index(drop=True, inplace = True)
    df_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = df_sub
        sub[cols] *= weight
    else:
        sub[cols] += df_sub[cols] * weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv(SUBMS_DIR+"submission_{}.csv".format(MODEL_VER), index=False)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

read train: (30490, 1919)
first day is: 1183
last day is: 1913
melted train: (3567330, 8)
read calendar: (1969, 14)
calendar merge done
read prices: (6841121, 4)
prices merge done
begin train date: 2016-02-24 00:00:00
end train date: 2016-06-19 00:00:00
date cut train: (3567330, 22)
cut train date: 2016-02-24 00:00:00
end train date: 2016-06-19 00:00:00
in dataframe: (3567330, 22)
lag sales done
rolling mean sales for feature done: id
rolling mean sales for feature done: item_id
date features done
out dataframe: (3567330, 35)
Forecast day: 2016-04-24 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-04-25 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-04-26 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-04-27 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-04-28 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-04-29 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-04-30 00:00:00
in dataframe: (1859890, 35)
Forecast day: 2016-05-01 00:00:00
