In [None]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from catboost import Pool, CatBoostRegressor
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# CUDA_VISIBLE_DEVICES is a comma-separated list of device ids, so the number
# of devices is the number of non-empty entries, not the length of the string.
# The variable may also be absent, in which case nothing can be counted here.
cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
visible_gpu_ids = [
    dev.strip()
    for dev in (cuda_visible_devices or '').split(',')
    if dev.strip()
]
if cuda_visible_devices is None:
    print('CUDA_VISIBLE_DEVICES is not set')
else:
    print('available GPU devices:', len(visible_gpu_ids),
          ' | device num:', cuda_visible_devices)
from catboost.utils import get_gpu_device_count
from tqdm.notebook import tqdm
# Report the GPU count returned by catboost's own device query.
print('available GPU devices catboost:', get_gpu_device_count())

In [None]:
# Directory the input CSV files are read from.
DATA_DIR = './data'
# Directory the trained model file is written to and read back from.
MODELS_DIR = './models'
# Version tag that becomes part of the model file name.
MODEL_VER = 'v0'
# Number of days of history before END_DATE kept for the forecast frame.
BACKWARD_LAGS = 60
# Training rows dated before CUT_DATE are discarded.
CUT_DATE = '2014-01-01'
# Boundary between the training period and the validation period.
VAL_DATE = '2016-04-01'
# Reference date the forecast days are counted from (day 1 = END_DATE + 1 day).
END_DATE = '2016-04-24'
# Echo END_DATE parsed with the same format string the forecast code uses.
print(datetime.strptime(END_DATE, '%Y-%m-%d'))
# Timeline of the periods delimited by the dates above:
#-----|CUT_DATE|---train---|VAL_DATE|--val--|END_DATE|--forecast +28 days-->

In [None]:
# dtype passed to read_csv for every calendar.csv column, grouped by dtype.
CALENDAR_DTYPES = {
    'date': 'str',
    **dict.fromkeys(
        ['weekday', 'd', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'],
        'object',
    ),
    **dict.fromkeys(
        ['wm_yr_wk', 'wday', 'month', 'year', 'snap_CA', 'snap_TX', 'snap_WI'],
        'int16',
    ),
}
# Calendar columns that read_csv should parse into datetimes.
PARSE_DATES = ['date']
# dtype passed to read_csv for every sell_prices.csv column.
SPRICES_DTYPES = dict(
    store_id='object',
    item_id='object',
    wm_yr_wk='int16',
    sell_price='float32',
)

In [None]:
def get_df(is_train=True, backward_lags=None):
    """Load sales, calendar and prices and join them into one long frame.

    The wide sales table (one `d_<n>` column per day) is melted to one row
    per (series, day), then merged with the calendar on `d` and with the
    prices on `store_id`, `item_id` and `wm_yr_wk`. Both merges use the
    default join type of `DataFrame.merge`.

    Parameters
    ----------
    is_train : bool
        True: keep rows dated on or after CUT_DATE.
        False: append 28 + 28 empty (NaN) day columns after the last known
        day before melting, and keep rows dated on or after
        END_DATE - `backward_lags` days.
    backward_lags : int or None
        Days of history before END_DATE to keep. Required when
        `is_train` is False; not used otherwise.

    Returns
    -------
    pandas.DataFrame
        The merged long frame restricted to the requested date range.

    Raises
    ------
    ValueError
        If `is_train` is False and `backward_lags` is None.
    """
    # Fail before any file is read instead of after all the loading/merging.
    if not is_train and backward_lags is None:
        raise ValueError('backward_lags must be set when is_train=False')

    strain = pd.read_csv('{}/sales_train_validation.csv'.format(DATA_DIR))
    print('read train:', strain.shape)
    cat_cols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    # The last column is the most recent day, named 'd_<number>'.
    last_day = int(strain.columns[-1].replace('d_', ''))
    print('last day is:', last_day)
    if not is_train:
        # Add all future day columns (filled with NaN) in one operation;
        # inserting them one at a time fragments the frame.
        future_cols = [
            'd_{}'.format(day)
            for day in range(last_day + 1, last_day + 28 + 28 + 1)
        ]
        strain = strain.reindex(columns=list(strain.columns) + future_cols)
    strain = pd.melt(
        strain,
        id_vars = cat_cols,
        value_vars = [col for col in strain.columns if col.startswith('d_')],
        var_name = 'd',
        value_name = 'sales'
    )
    print('melted train:', strain.shape)
    calendar = pd.read_csv('{}/calendar.csv'.format(DATA_DIR), dtype=CALENDAR_DTYPES, parse_dates=PARSE_DATES)
    print('read calendar:', calendar.shape)
    strain = strain.merge(calendar, on='d', copy=False)
    print('calendar merge done')
    sprices = pd.read_csv('{}/sell_prices.csv'.format(DATA_DIR), dtype=SPRICES_DTYPES)
    print('read prices:', sprices.shape)
    strain = strain.merge(
        sprices, 
        on=['store_id', 'item_id', 'wm_yr_wk'], 
        copy=False
    )
    print('prices merge done')
    print('begin train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    if not is_train:
        # Only the recent history is kept for the forecast frame.
        first_kept_date = datetime.strptime(END_DATE, '%Y-%m-%d') - timedelta(days=backward_lags)
        strain = strain.loc[strain['date'] >= first_kept_date]
    else:
        strain = strain.loc[strain['date'] >= CUT_DATE]
    print('date cut train:', strain.shape)
    print('cut train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    return strain

In [None]:
def _iso_week(dates):
    """Return the ISO week number for each value of a datetime Series.

    `Series.dt.isocalendar().week` is used when the accessor provides it;
    `Series.dt.weekofyear` is only a fallback for pandas versions without
    `isocalendar`.
    """
    if hasattr(dates.dt, 'isocalendar'):
        return dates.dt.isocalendar().week
    return dates.dt.weekofyear


def make_features(strain):
    """Add lag, rolling-mean and calendar features to a long sales frame.

    The frame is modified in place and also returned. Added columns, in order:

    * `lag_7`, `lag_28`: `sales` shifted by that many rows within each `id`.
    * `<lag_col>_<group>_rmean_<window>`: rolling mean of a lag column inside
      each group of `id`, `item_id`, `store_id`, `cat_id`, for windows 7 and 28.
    * `week_num`, `quarter`, `mday`: taken from `date`, stored as int16.

    Finally `d` is converted from 'd_<n>' labels to integers. A `d` column
    that is already numeric is left alone, so the function can be applied
    again to its own output.

    Parameters
    ----------
    strain : pandas.DataFrame
        Needs the columns `id`, `item_id`, `store_id`, `cat_id`, `sales`,
        `date` (datetime) and `d`. Shifts and rolling windows follow the
        frame's row order, so rows are presumably expected to be in
        chronological order within each `id` - verify against the caller.

    Returns
    -------
    pandas.DataFrame
        The same frame object with the feature columns added.
    """
    print('in dataframe:', strain.shape)
    lags = [7, 28]
    windows = [7, 28]
    wnd_feats = [
        'id',
        'item_id',
        #'dept_id',
        'store_id',
        'cat_id',
        #'state_id'
    ]
    lag_cols = ['lag_{}'.format(lag) for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        strain[lag_col] = strain.groupby('id')['sales'].shift(lag)
    print('lag sales done')
    # NOTE(review): the rolling window is counted in rows of each group, in
    # frame order. For groups that contain several ids (item_id, store_id,
    # cat_id) a window of `wnd` rows is not a window of `wnd` days - confirm
    # that this is the intended feature.
    for wnd_feat in wnd_feats:
        for wnd in windows:
            for lag_col in lag_cols:
                wnd_col = '{}_{}_rmean_{}'.format(lag_col, wnd_feat, wnd)
                strain[wnd_col] = strain.groupby(wnd_feat)[lag_col].transform(
                    lambda x: x.rolling(wnd).mean()
                )
        print('rolling mean sales for feature done:', wnd_feat)
    # Feature name -> function extracting it from the `date` column.
    date_features = {
        'week_num': _iso_week,
        'quarter': lambda dates: dates.dt.quarter,
        'mday': lambda dates: dates.dt.day,
    }
    for date_feat_name, date_feat_func in date_features.items():
        strain[date_feat_name] = date_feat_func(strain['date']).astype('int16')
    print('date features done')
    # Only convert 'd_<n>' labels; integers from an earlier call stay as they are.
    if not pd.api.types.is_numeric_dtype(strain['d']):
        strain['d'] = strain['d'].str.replace('d_', '', regex=False).astype('int64')
    print('out dataframe:', strain.shape)
    return strain

In [None]:
%%time
# Build the training frame; backward_lags is only read by get_df when is_train is False.
strain = get_df(is_train=True, backward_lags=None)

In [None]:
%%time
# Add the lag, rolling-mean and date features; make_features writes the new
# columns into the frame it receives and returns that frame.
strain = make_features(strain)

In [None]:
# Plot sales and a few engineered features for one randomly chosen series.
id_name = np.random.choice(strain['id'].unique())
print('id to draw:', id_name)
id_sales = strain.loc[strain['id'] == id_name].set_index('date')
print('from', strain['date'].min(), 'to', strain['date'].max()) 
plt.figure(figsize=(18, 4))
id_sales['sales'].plot(label='sales')
id_sales['lag_7'].plot(label='lag_7')
id_sales['lag_7_id_rmean_7'].plot(label='lag_7_id_rmean_7')
# The group-level rolling means are optional. A missing column is reported and
# the remaining optional curves are skipped; any other error is a real problem
# and is allowed to surface instead of being swallowed.
for optional_col in ('lag_7_store_id_rmean_7', 'lag_7_cat_id_rmean_7'):
    if optional_col not in id_sales.columns:
        print('no features')
        break
    id_sales[optional_col].plot(label=optional_col)
plt.title(id_name)
plt.legend()
plt.show()

In [None]:
# Columns that are not model inputs: the row identifier, the target and date fields.
drop_cols = ['id', 'sales', 'date', 'wm_yr_wk', 'weekday']
# Every remaining column of the feature frame is a model input, in frame order.
train_cols = strain.columns[~strain.columns.isin(drop_cols)]
# Columns handed to CatBoost as categorical features.
cat_cols = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 
    #'year', 'wday', 'month', 'quarter',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    #'snap_CA', 'snap_TX', 'snap_WI'
]
# Replace missing categorical values with 0 so these columns contain no NaN.
strain[cat_cols] = strain[cat_cols].fillna(0)

In [None]:
%%time
# Validation strategy: True = rows after VAL_DATE, False = random 15% of rows.
USE_TIME_SPLIT = False
# Fixed seed so the random validation sample is the same on every run.
SPLIT_SEED = 42
if USE_TIME_SPLIT:
    is_train_row = strain['date'] <= VAL_DATE
    is_val_row = strain['date'] > VAL_DATE
    X_train = strain.loc[is_train_row, train_cols]
    y_train = strain.loc[is_train_row, 'sales']
    X_val = strain.loc[is_val_row, train_cols]
    y_val = strain.loc[is_val_row, 'sales']
else:
    split_rng = np.random.default_rng(SPLIT_SEED)
    val_size = int(strain.shape[0] * .15)
    val_idxs = split_rng.choice(strain.index.values, val_size, replace=False)
    train_idxs = np.setdiff1d(strain.index.values, val_idxs)
    # Select rows and columns in one .loc call so only the needed columns
    # are materialised.
    X_train = strain.loc[train_idxs, train_cols]
    y_train = strain.loc[train_idxs, 'sales']
    X_val = strain.loc[val_idxs, train_cols]
    y_val = strain.loc[val_idxs, 'sales']
print('train shapes:', X_train.shape, len(y_train))
print('val shapes:', X_val.shape, len(y_val))
train_pool = Pool(
    X_train, 
    y_train,
    cat_features=cat_cols
)
val_pool = Pool(
    X_val, 
    y_val,
    cat_features=cat_cols
)

In [None]:
# Release the split intermediates. Names that were never created are skipped:
# the date-based split defines no index arrays, and the cell may be re-run.
for _tmp_name in ('X_train', 'y_train', 'X_val', 'y_val', 'val_idxs', 'train_idxs', 'val_size'):
    globals().pop(_tmp_name, None)
del _tmp_name
gc.collect()

In [None]:
# Hyper-parameters of the regressor, kept together so they are easy to tune.
model_params = {
    'iterations': 1000,
    'task_type': 'GPU',  # 'CPU'
    'verbose': 0,
    'loss_function': 'RMSE',
    'boosting_type': 'Plain',
    'depth': 6,
    'gpu_cat_features_storage': 'CpuPinnedMemory',
    # 'max_ctr_complexity': 2,
}
model = CatBoostRegressor(**model_params)
# Train on the training pool, evaluating on the validation pool with the live plot.
model.fit(train_pool, eval_set=val_pool, plot=True)

In [None]:
# Make sure the target directory exists before writing the model file.
os.makedirs(MODELS_DIR, exist_ok=True)
model.save_model('{}/model_{}.cbm'.format(MODELS_DIR, MODEL_VER))

In [None]:
# Read the saved model back from disk into a fresh regressor.
model_path = '{}/model_{}.cbm'.format(MODELS_DIR, MODEL_VER)
model = CatBoostRegressor()
model = model.load_model(model_path)

In [None]:
# Pair every model input column with its importance, most important first.
feat_importances = sorted(
    zip(train_cols, model.get_feature_importance()),
    key=lambda pair: pair[1],
    reverse=True
)
# Only features whose importance exceeds this value are drawn.
threshold = .25
shown_importances = [pair for pair in feat_importances if pair[1] > threshold]
labels = [pair[0] for pair in shown_importances]
values = [pair[1] for pair in shown_importances]
fig, ax = plt.subplots(figsize=(8, 4))
y_pos = np.arange(len(labels))
ax.barh(y_pos, values)
ax.set_yticks(y_pos)
ax.set_yticklabels(labels)
# Put the most important feature at the top of the chart.
ax.invert_yaxis()
# The bars show feature importance values, so label the axis accordingly.
ax.set_xlabel('Importance')
ax.set_title('feature importances')
plt.show()

In [None]:
# Predict every row dated on or after VAL_DATE, working on a copy so the
# training frame keeps the real sales.
spred = strain[strain['date'] >= VAL_DATE].copy()
preds = model.predict(spred[train_cols])
print('predictions done:', len(preds))
# Replace the whole column rather than writing through .loc[:, ...]: the
# predictions are floats and must not be forced into the existing sales dtype.
spred['sales'] = preds

In [None]:
# Compare actual and predicted sales of one random series over the validation period.
id_name = np.random.choice(strain['id'].unique())
print('id to draw:', id_name)
is_actual_row = (strain['id'] == id_name) & (strain['date'] >= VAL_DATE)
id_sales = strain.loc[is_actual_row].set_index('date')
id_sales_pred = spred.loc[spred['id'] == id_name].set_index('date')
print('from', strain['date'].min(), 'to', strain['date'].max()) 
fig, ax = plt.subplots(figsize=(18, 4))
id_sales['sales'].plot(ax=ax, label='sales')
id_sales_pred['sales'].plot(ax=ax, label='sales prediction')
ax.set_title(id_name)
ax.legend()
plt.show()

In [None]:
%%time
# Multiplier applied to every prediction (1 leaves predictions unchanged).
growth_rate = 1
spred = get_df(is_train=False, backward_lags=BACKWARD_LAGS)
# Parse END_DATE once; every forecast date is an offset from it.
forecast_origin = datetime.strptime(END_DATE, '%Y-%m-%d')
# Forecast one day at a time. Each prediction is written back into `spred`
# so that later days can use it through their lag features.
for pred_day in tqdm(range(1, 28 + 28 + 1)):
    pred_date = forecast_origin + timedelta(days=pred_day)
    pred_date_back = pred_date - timedelta(days=BACKWARD_LAGS + 1)
    print('-' * 70)
    print('forecast day forward:', pred_day, '| forecast date:', pred_date) 
    # History window needed to compute the features of the forecast day.
    in_window = (spred['date'] >= pred_date_back) & (spred['date'] <= pred_date)
    spred_data = make_features(spred.loc[in_window].copy())
    # Select the forecast-day rows with a mask built from the frame being
    # indexed, and copy them so the fillna below assigns into an independent frame.
    spred_data = spred_data.loc[spred_data['date'] == pred_date, train_cols].copy()
    spred_data[cat_cols] = spred_data[cat_cols].fillna(0)
    spred.loc[spred['date'] == pred_date, 'sales'] = growth_rate * model.predict(spred_data)

In [None]:
# Draw the recent actual sales of one random series followed by its forecast.
id_name = np.random.choice(strain['id'].unique())
print('id to draw:', id_name)
is_actual_row = (strain['id'] == id_name) & (strain['date'] >= VAL_DATE)
is_forecast_row = (spred['id'] == id_name) & (spred['date'] >= END_DATE)
id_sales = strain.loc[is_actual_row].set_index('date')
id_sales_pred = spred.loc[is_forecast_row].set_index('date')
print('from', strain['date'].min(), 'to', spred['date'].max()) 
fig, ax = plt.subplots(figsize=(18, 4))
id_sales['sales'].plot(ax=ax, label='sales')
id_sales_pred['sales'].plot(ax=ax, label='sales prediction')
ax.set_title(id_name)
ax.legend()
plt.show()

In [None]:
# Keep only the forecast rows and the columns the submission needs.
is_forecast_row = spred['date'] > END_DATE
spred_subm = spred.loc[is_forecast_row, ['id', 'd', 'sales']].copy()
# Day number of END_DATE, taken from its 'd_<n>' label.
last_d_label = spred.loc[spred['date'] == END_DATE, 'd'].unique()[0]
last_d = int(last_d_label.replace('d_', ''))
print('last d num:', last_d)
# Relabel each day as 'F<k>', k being its distance in days from END_DATE.
spred_subm['d'] = [
    'F{}'.format(int(day_label.replace('d_', '')) - last_d)
    for day_label in spred_subm['d']
]
# Negative predictions are replaced by zero.
spred_subm['sales'] = spred_subm['sales'].clip(lower=0)

In [None]:
# Forecast column labels F1..F56 in submission order.
f_cols = ['F{}'.format(x) for x in range(1, 28 + 28 + 1)]
# Long -> wide: one row per id, one column per forecast day; gaps become 0.
spred_subm = (
    spred_subm
    .set_index(['id', 'd'])['sales']
    .unstack()[f_cols]
    .reset_index()
    .fillna(0)
    .sort_values('id')
    .reset_index(drop=True)
)
spred_subm.head()

In [None]:
# First 28 forecast days belong to the validation rows, the next 28 to the evaluation rows.
f_cols = ['F{}'.format(x) for x in range(1, 28 + 1)]
f_cols_eval = ['F{}'.format(x) for x in range(28 + 1, 28 + 28 + 1)]
# Split the wide frame into the two halves.
spred_subm_eval = spred_subm.drop(columns=f_cols)
spred_subm = spred_subm.drop(columns=f_cols_eval)
# The evaluation half reuses the F1..F28 column labels and gets evaluation ids.
spred_subm_eval.columns = spred_subm.columns
spred_subm_eval['id'] = spred_subm_eval['id'].str.replace('validation', 'evaluation')
# Stack both halves and write the submission file.
spred_subm = pd.concat(
    [spred_subm, spred_subm_eval], axis=0, sort=False
).reset_index(drop=True)
spred_subm.to_csv('submission.csv', index=False)
print('submission saved:', spred_subm.shape)