# SETTINGS

This notebook computes simple blending ensembles of the predictions produced by the different LightGBM model variants that were implemented in `notebook_03_modeling.ipynb` over the course of the project.

The ensembled predictions are exported as `sub_[name].csv` in the `../submissions` folder.

In [1]:
##### LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats
from scipy.stats import gmean

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc
from tqdm import tqdm
import importlib
import sys

In [2]:
##### SETTINGS

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
plt.style.use('dark_background')
%matplotlib inline
gc.enable()

# IMPORT PREDICTIONS

In [None]:
##### IMPORT OOF PREDS

# version threshold
min_lgb_version = 17
min_df_version  = 12

# prepare model names
# - only '.npy' files are considered, so stray directory entries such as
#   '.DS_Store' or '.ipynb_checkpoints' cannot break the version parsing below
# - file names are split on '_': field 1 carries the LightGBM version and field 3
#   the data version; the first character of each field is dropped before the
#   remainder is compared as an integer against the thresholds above
models = []
for file_name in os.listdir('../oof_preds'):
    model_name, extension = os.path.splitext(file_name)
    if extension != '.npy':
        continue
    name_fields = model_name.split('_')
    if int(name_fields[1][1:]) < min_lgb_version:
        continue
    if int(name_fields[3][1:]) < min_df_version:
        continue
    models.append(model_name)
models = sorted(models)
print('OOF predictions:', len(models))
models

In [None]:
# preprocessing loop
# per-model columns are collected in lists and concatenated once after the loop
# instead of re-concatenating the growing frames on every iteration
oof_columns  = []
test_columns = []
for m in models:

    # load preds
    tmp_tr = np.load('../oof_preds/'           + m + '.npy')
    tmp_te = pd.read_csv('../submissions/sub_' + m + '.csv', sep = '|')

    # split ID from test preds (taken once, from the first model)
    if m == models[0]:
        id_test = tmp_te[['itemID']]

    # flatten OOF preds (first slice of the loaded array) into a column named after the model
    oof_columns.append(pd.Series(tmp_tr[0].reshape(-1), name = m))

    # keep the test preds only, as a column named after the model
    test_columns.append(tmp_te['demandPrediction'].rename(m))

# stack preds (one column per model)
preds_oof  = pd.concat(oof_columns,  axis = 1)
preds_test = pd.concat(test_columns, axis = 1)

# extract OOF prices and targets
# NOTE(review): taken from the last file loaded in the loop; this assumes every OOF
# file stores the same targets (slice 1) and prices (slice 2) - TODO confirm
reals_oof  = tmp_tr[1].reshape(-1)
prices_oof = tmp_tr[2].reshape(-1)

# display information
print('- Train shape:', preds_oof.shape)
print('- Test shape:',  preds_test.shape)

# BLENDING

In [None]:
##### MODULES

# add the '../codes' folder to the module search path
sys.path.append('../codes')  

# project helpers (presumably located in '../codes' - not visible in this notebook)
# profit() is called below with targets, predictions and prices; postprocess_preds() is applied to each blend
from evaluation import profit
from postprocessing import postprocess_preds

In [None]:
##### COMPUTE BLENDS

# OOF predictions as a plain array (one column per model)
oof_matrix = np.array(preds_oof)

# simple ensembles: row-wise statistics across the model columns
amean  = np.array(preds_oof.mean(axis   = 1))
median = np.array(preds_oof.median(axis = 1))
mmin   = np.array(preds_oof.min(axis    = 1))
mmax   = np.array(preds_oof.max(axis    = 1))
gemean = gmean(oof_matrix, axis = 1)

# weighted mean: each model is weighted by its share of the summed integers
# parsed from the sixth '_'-separated field of the model names
model_profits = [int(model_name.split('_')[5]) for model_name in models]
total_profit  = sum(model_profits)
model_weights = [model_profit / total_profit for model_profit in model_profits]
wmean         = np.average(oof_matrix, axis = 1, weights = model_weights)

In [None]:
# check profit on training data
# every blend is paired with its name so that the printed profits can be told apart
named_blends = [('amean',  amean),
                ('gemean', gemean),
                ('median', median),
                ('mmin',   mmin),
                ('mmax',   mmax),
                ('wmean',  wmean)]
for blend_name, blend_preds in named_blends:
    blend_preds = postprocess_preds(blend_preds)
    profits     = profit(reals_oof, blend_preds, price = prices_oof)
    # tmp_tr is the last OOF array loaded in the preprocessing cell; its second
    # dimension is used as divisor - presumably to average over repeated OOF rounds, TODO confirm
    profits     = profits / tmp_tr.shape[1]
    print('- blend profit ({}):'.format(blend_name), np.round(profits).astype('int'))

# check individual profits (integer parsed from the sixth '_'-separated field of each model name)
model_profits = [int(m.split('_')[5]) for m in models]
print('')
print('- max individual profit:', max(model_profits))

In [None]:
# compute best ensemble
# row-wise median of the test predictions across models, followed by post-processing
test_median = preds_test.median(axis = 1)
blend       = postprocess_preds(np.array(test_median))

# SUBMISSION

In [None]:
##### SUBMISSION

# model name: blend type followed by the number of blended predictions
name     = 'median'
sub_name = ''.join(['blend_', name, str(len(models)), 'preds'])
sub_path = '../submissions/sub_' + sub_name + '.csv'

# save submission
# NOTE(review): the blend is assigned by position; this assumes the sample submission
# lists the items in the same order as the test prediction files - TODO confirm
sub = pd.read_csv('../submissions/sample_submission.csv', sep = '|')
sub = sub.assign(demandPrediction = blend)
sub.to_csv(sub_path, sep = '|', index = False)
print(sub.shape)
sub.head()