# SETTINGS

This notebook implements a stacking ensemble of the predictions produced by the different variants of the LightGBM models that were implemented in `notebook_03_modeling.ipynb` over the course of the project.

Stacking is implemented using higher-level LightGBM models. The ensembled predictions are exported as `sub_stack_[name].csv`.

In [None]:
##### LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats
from scipy.stats import gmean

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc
from tqdm import tqdm
import importlib
import sys

from sklearn.model_selection import KFold

import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
##### SETTINGS

# silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')
# do not truncate the column list when displaying DataFrames
pd.set_option('display.max_columns', None)
# dark plotting theme; applied before the inline backend is activated
plt.style.use('dark_background')
# IPython magic: render matplotlib figures inside the notebook
%matplotlib inline
# make sure automatic garbage collection is switched on
gc.enable()

# IMPORT PREDICTIONS

In [None]:
##### IMPORT OOF PREDS

# version threshold
# only models whose LGB version / data version reach these minimums are stacked
min_lgb_version = 17
min_df_version  = 12

# prepare model names
# Only '*.npy' files are considered, so stray directory entries ('.DS_Store',
# checkpoint folders, ...) can never reach the version parsing below.
# The extension is stripped first so that a version token in last position
# ('..._v12.npy') still parses as an integer.
models = [os.path.splitext(f)[0] for f in os.listdir('../oof_preds') if f.endswith('.npy')]

# The 2nd and 4th underscore-separated tokens of a model name consist of a
# one-character prefix followed by the LGB version and the data version.
models = [m for m in models if int(m.split('_')[1][1:]) >= min_lgb_version]
models = [m for m in models if int(m.split('_')[3][1:]) >= min_df_version]

# deterministic column order for the stacking features
models = sorted(models)
print('OOF predictions:', len(models))
models

In [None]:
# preprocessing loop
# Builds two frames with one column per base model (column name = model name):
#   preds_oof  - flattened out-of-fold predictions
#   preds_test - test-set predictions read from the model's submission file
# Also leaves behind id_test (item IDs of the first submission file),
# reals_oof / prices_oof (flattened targets and prices) and tmp_tr (the last
# OOF array loaded, which the evaluation code further down still reads).
if not models:
    raise ValueError('No OOF predictions selected: nothing to stack')

oof_columns  = []
test_columns = []
for m in models:

    # load preds
    tmp_tr = np.load('../oof_preds/'           + m + '.npy')
    tmp_te = pd.read_csv('../submissions/sub_' + m + '.csv', sep = '|')

    # split ID from test preds (taken once, from the first model)
    if m == models[0]:
        id_test = tmp_te[['itemID']]

    # slice 0 of the stored array holds the OOF predictions; flatten to one column
    tmp_preds_oof = pd.DataFrame(tmp_tr[0].reshape(-1), columns = [m])
    tmp_te        = tmp_te[['demandPrediction']].rename(columns = {'demandPrediction': m})

    # collect the columns; they are joined once after the loop instead of
    # re-copying a growing frame on every iteration
    oof_columns.append(tmp_preds_oof)
    test_columns.append(tmp_te)

# stack preds
preds_oof  = pd.concat(oof_columns,  axis = 1)
preds_test = pd.concat(test_columns, axis = 1)

# extract OOF prices and targets
# slices 1 and 2 of the last loaded array hold the targets and the prices
# NOTE(review): assumes every model file stores identical targets/prices - confirm
reals_oof  = tmp_tr[1].reshape(-1)
prices_oof = tmp_tr[2].reshape(-1)

# display information
print('- Train shape:', preds_oof.shape)
print('- Test shape:',  preds_test.shape)

In [None]:
# rename objects
# stacking inputs are independent copies of the imported predictions:
#   X / X_test - base-model predictions (train OOF / test), one column per model
#   y          - targets, X_prices - prices (both as Series)
X        = preds_oof.copy(deep = True)
X_test   = preds_test.copy(deep = True)
X_prices = pd.Series(prices_oof, copy = True)
y        = pd.Series(reals_oof,  copy = True)

In [None]:
# read items
# compression is passed explicitly because the '.csv' extension would not
# trigger pandas' automatic gzip detection
items = pd.read_csv(
    '../data/prepared/items_v1.csv',
    compression = 'gzip',
)
print(items.shape)

# STACKING

In [None]:
##### MODULES

# make the project's helper modules importable; the path is relative to the
# notebook's working directory
sys.path.append('../codes')  

# project-local helpers defined outside this notebook
# (presumably: custom asymmetric loss and its eval metric, the profit score,
# and the prediction post-processing step - see the modules for details)
from evaluation import asymmetric_mse, asymmetric_mse_eval, profit
from postprocessing import postprocess_preds

In [None]:
##### LIST RELEVANT FEATURES

# every column of X is a stacking feature unless it is listed in drop_feats
drop_feats = []
features = [col for col in X.columns if col not in drop_feats]
print(len(features), 'features')
features

In [None]:
##### MODELING PARAMETERS

# reproducibility: the same seed drives the fold split and LightGBM
seed = 777

# K-fold cross-validation setup
num_folds = 5
shuffle   = True

# training options: threads, early-stopping patience, logging period
cores       = 4
stop_rounds = 100
verbose     = 100

# LGB parameters of the second-level (stacking) model
lgb_params = dict(
    boosting_type    = 'goss',
    objective        = 'rmse',   # alternative: the custom asymmetric_mse objective
    metrics          = 'rmse',
    n_estimators     = 1000,
    learning_rate    = 0.1,
    bagging_fraction = 0.8,
    feature_fraction = 0.8,
    lambda_l1        = 0.1,
    lambda_l2        = 0.1,
    silent           = True,
    verbosity        = -1,
    nthread          = cores,
    random_state     = seed,
)

# data partitioning
folds = KFold(n_splits = num_folds, shuffle = shuffle, random_state = seed)

In [None]:
##### CROSS-VALIDATION LOOP

# Trains one LightGBM stacker per fold on the base-model predictions, using
# the prices as sample weights, and produces:
#   preds_oof  - post-processed out-of-fold predictions of the stacker
#   preds_test - fold-averaged, post-processed test predictions
#   reals_oof  - targets aligned with preds_oof
#   oof_rmse / oof_profit / oracle_profit - pooled scores over all OOF rows
# NOTE: preds_oof / preds_test / reals_oof are rebound here; before this cell
# they hold the imported base-model predictions.
# NOTE(review): fit(early_stopping_rounds=..., verbose=...) was removed in
# LightGBM 4.0 in favour of callbacks - confirm the installed version.

# placeholders
# preds_test is sized by the items table; assumes one test row per item
preds_test = np.zeros(items.shape[0])
preds_oof  = np.zeros(X.shape[0])
reals_oof  = np.zeros(X.shape[0])

# per-fold scores, kept under their own names so they stay available after
# the pooled scores are computed below
fold_rmse          = []
fold_profit        = []
fold_oracle_profit = []

# cross-validation
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):

    # data partitioning
    X_train, y_train = X[features].iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X[features].iloc[val_idx], y.iloc[val_idx]
    w_train = X_prices.iloc[trn_idx].values
    w_valid = X_prices.iloc[val_idx].values

    # training
    clf = lgb.LGBMRegressor(**lgb_params)
    clf = clf.fit(X_train, y_train,
                  eval_set              = [(X_train, y_train), (X_valid, y_valid)],
                  eval_metric           = 'rmse',
                  sample_weight         = w_train,
                  eval_sample_weight    = [w_train, w_valid],
                  early_stopping_rounds = stop_rounds,
                  verbose               = verbose)

    # predictions
    # the test frame is restricted to the training features so the model
    # still receives matching columns when drop_feats is non-empty
    reals_oof[val_idx] = y_valid
    preds_oof[val_idx] = postprocess_preds(clf.predict(X_valid))
    preds_test        += postprocess_preds(clf.predict(X_test[features])) / num_folds

    # evaluation
    fold_rmse.append(np.sqrt(mean_squared_error(reals_oof[val_idx], preds_oof[val_idx])))
    fold_profit.append(profit(reals_oof[val_idx],        preds_oof[val_idx], price = w_valid))
    fold_oracle_profit.append(profit(reals_oof[val_idx], reals_oof[val_idx], price = w_valid))

    # information
    print('-' * 65)
    print('FOLD {:d}/{:d}: RMSE = {:.2f}, PROFIT = {:.0f}'.format(fold + 1,
                                                                  num_folds,
                                                                  fold_rmse[fold],
                                                                  fold_profit[fold]))
    print('-' * 65)
    print('')


# print performance
# pooled scores over all OOF rows; oracle_profit scores the true targets as
# predictions and serves as the reference for the percentage below
# tmp_tr is the last OOF array loaded by the import loop
# NOTE(review): dividing by tmp_tr.shape[1] presumably rescales the pooled
# profit to a single evaluation period - confirm against the OOF file layout
oof_rmse      = np.sqrt(mean_squared_error(reals_oof, preds_oof))
oof_profit    = profit(reals_oof, preds_oof, price = X_prices.values) / tmp_tr.shape[1]
oracle_profit = profit(reals_oof, reals_oof, price = X_prices.values) / tmp_tr.shape[1]
print('')
print('-' * 65)
print('- AVERAGE RMSE:   {:.2f}'.format(np.mean(oof_rmse)))
print('- AVERAGE PROFIT: {:.0f} ({:.2f}%)'.format(np.mean(oof_profit), 100 * np.mean(oof_profit) / np.mean(oracle_profit)))
print('-' * 65)

# SUBMISSION

In [None]:
##### SUBMISSION

# model name: encodes how many base-model predictions were stacked
sub_name = 'stack_{}preds'.format(len(models))

# save submission
# the sample submission provides the file layout; the stacked predictions are
# post-processed once more and written into its 'demandPrediction' column
# NOTE(review): values are assigned by row position - confirm that the sample
# submission lists the items in the same order as the base-model submissions
sub = pd.read_csv('../submissions/sample_submission.csv', sep = '|')
sub = sub.assign(demandPrediction = postprocess_preds(preds_test))
sub.to_csv('../submissions/sub_{}.csv'.format(sub_name), sep = '|', index = False)
print(sub.shape)
sub.head()