# Light GBM

Notebook used as example: https://www.kaggle.com/robertburbidge/lightgbm-poisson-w-scaled-pinball-loss

For this competition we used LightGBM, a gradient boosting framework that uses tree-based learning algorithms.

In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns; sns.set()
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import gc
from scipy.sparse import csr_matrix

import os 

Function for memory reduction: it downcasts numeric columns to smaller dtypes, which speeds up the notebook and helps prevent it from using all available RAM.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are touched; every other column is left as it is.  The frame is
    modified in place and also returned so the call can be chained.

    Args:
        df: DataFrame whose columns should be shrunk.
        verbose: When true, print the resulting size and the relative saving.

    Returns:
        The same ``df`` object, with downcast columns.

    Note:
        Float columns are narrowed purely on their value *range*, so casting
        to float16/float32 can lose precision (float16 cannot represent every
        integer above 2048, for instance).
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column that reaches exactly the dtype's
            # limit (e.g. 127 for int8) still fits in that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

Load data provided by the M5 Forecasting competition and reduce the memory.

In [None]:
# Read each competition CSV and immediately shrink its numeric dtypes.
calendar = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv').pipe(reduce_mem_usage)
sell_prices = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv').pipe(reduce_mem_usage)
sales_train_eval = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv').pipe(reduce_mem_usage)
submission = pd.read_csv('../input/m5-forecasting-accuracy/sample_submission.csv').pipe(reduce_mem_usage)

The function encode_categorical encodes the non-NaN strings in the specified columns as integers.

In [None]:
def encode_categorical(df, cols):
    """Replace the non-null values of each column in ``cols`` by integer codes.

    A fresh ``LabelEncoder`` is fitted per column on the non-null entries
    only.  Rows that were null receive no code and therefore stay missing
    after the assignment.  ``df`` is modified in place and returned.
    """
    for col in cols:
        present = df[col].dropna()
        codes = LabelEncoder().fit_transform(present)
        # Re-attach the original index so the codes align with their rows.
        df[col] = pd.Series(codes, index=present.index)
    return df

Data preprocessing before training.

In [None]:
# Number of rows in the wide training table (one per series)
NUM_ITEMS = len(sales_train_eval)
# Forecast horizon: every submission column except `id`
DAYS_PRED = len(submission.columns) - 1

# Integer-encode the categorical columns, then shrink the dtypes again
calendar = reduce_mem_usage(
    encode_categorical(calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"])
)
sales_train_eval = reduce_mem_usage(
    encode_categorical(sales_train_eval, ["item_id", "dept_id", "cat_id", "store_id", "state_id"])
)
sell_prices = reduce_mem_usage(
    encode_categorical(sell_prices, ["item_id", "store_id"])
)

# Row count covering 365 days for every series
nrows = 365 * NUM_ITEMS

# Go from wide (one column per day) to long format: the identifier columns
# id, item_id, dept_id, cat_id, store_id and state_id are kept, and each day
# column becomes one row with its name in `day` and its value in `demand`.
sales_train_eval = sales_train_eval.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='day',
    value_name='demand',
)
# Keep only the last `nrows` rows of the long table
# (the most recent days, assuming the day columns are in chronological order).
sales_train_eval = sales_train_eval.iloc[-nrows:]

# Add rows for forecasting
forecast_submission = submission

validation_rows = [row for row in forecast_submission['id'] if 'validation' in row]
evaluation_rows = [row for row in forecast_submission['id'] if 'evaluation' in row]

# .copy() so the column renames below act on independent frames
validation = forecast_submission[forecast_submission['id'].isin(validation_rows)].copy()
evaluation = forecast_submission[forecast_submission['id'].isin(evaluation_rows)].copy()

# Rename the placeholder forecast columns to the day labels they stand for
validation.columns = ["id"] + [f"d_{d}" for d in range(1914, 1914 + DAYS_PRED)]
evaluation.columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + DAYS_PRED)]

# One row per series with its encoded categorical attributes
product = sales_train_eval[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()

# The ids in `product` carry the suffix of the training file, whereas the
# submission holds both a `_validation` and an `_evaluation` variant of every
# series.  Merging on the raw id would leave one of the two groups with
# all-NaN attributes, which in turn forces those columns to float in the
# concatenated frame.  Normalise the suffix on the lookup side so that both
# groups find their attributes.
validation = validation.merge(
    product.assign(id=product['id'].str.replace('_evaluation', '_validation', regex=False)),
    how='left', on='id')
evaluation = evaluation.merge(
    product.assign(id=product['id'].str.replace('_validation', '_evaluation', regex=False)),
    how='left', on='id')

validation = pd.melt(validation, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='day', value_name='demand')
evaluation = pd.melt(evaluation, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='day', value_name='demand')

# Tag every row with its origin so the parts can be told apart later
sales_train_eval['part'] = 'train'
validation['part'] = 'test1'
evaluation['part'] = 'test2'

# Add validation and evaluation rows to the sales_train_eval dataframe
data = pd.concat([sales_train_eval, validation, evaluation], axis=0)
data = reduce_mem_usage(data)

del validation, evaluation, sales_train_eval, forecast_submission
gc.collect()

# These calendar columns are not used downstream
calendar.drop(columns=['weekday', 'wday', 'month', 'year'], inplace=True)

# Attach the calendar information to every row via its day label,
# then discard the two join keys.
data = data.merge(calendar, how='left', left_on='day', right_on='d')
data.drop(columns=['d', 'day'], inplace=True)

del calendar
gc.collect()

# Attach the price of each item in each store for the matching week
data = pd.merge(data, sell_prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])

del sell_prices
gc.collect()

Select train and test data.

In [None]:
data = reduce_mem_usage(data)

# Fit only on genuine history.  The date filter alone would also pick up the
# validation placeholder rows (part == 'test1'): they fall on or before
# 2016-05-22 but their `demand` is just the filler value from the sample
# submission, not an observed sale.
x_train = data[(data['date'] <= '2016-05-22') & (data['part'] == 'train')]
y_train = x_train['demand']

# Rows after the last training day are the ones to forecast.  Copy them so
# that writing predictions into `test` later never touches a slice of `data`.
test = data[data['date'] > '2016-05-22'].copy()

Train with LGBM and WRMSSE.

In [None]:
# Columns handed to the model
features = ["item_id", "dept_id", "cat_id", "store_id", "state_id",
            "event_name_1", "event_type_1", "event_name_2", "event_type_2",
            "snap_CA", "snap_TX", "snap_WI", "sell_price"]

# Aggregation matrix.  After the transpose every row is one aggregated series
# and every column one bottom-level series (in `product` row order); an entry
# is 1 when the bottom-level series belongs to that aggregate.
# The int8 dtype is requested at construction: building the identity as
# float64 first and casting afterwards would allocate an N x N float64 array.
weight_mat = np.c_[np.identity(NUM_ITEMS, dtype=np.int8),   # every series on its own
                   np.ones([NUM_ITEMS, 1], dtype=np.int8),  # grand total
                   pd.get_dummies(product.state_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.store_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.cat_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.dept_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.state_id.astype(str) + product.cat_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.state_id.astype(str) + product.dept_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.store_id.astype(str) + product.cat_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.store_id.astype(str) + product.dept_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.item_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.state_id.astype(str) + product.item_id.astype(str), drop_first=False).astype('int8').values
].T

# The matrix is mostly zeros, so keep only the sparse form around
weight_mat_csr = csr_matrix(weight_mat)
del weight_mat
gc.collect()

def weight_calc(data):
    """Compute the two per-series weight vectors used by ``wrmsse``.

    Relies on the module-level ``weight_mat_csr`` (aggregation matrix) and
    ``product`` (bottom-level series in the column order of that matrix).

    Args:
        data: Long-format frame with at least the columns ``id``, ``date``,
            ``demand`` and ``sell_price``.

    Returns:
        ``(weight1, weight2)``, one entry per row of ``weight_mat_csr``.
        ``weight1`` is the sum of squared day-to-day differences of the
        aggregated sales divided by the number of days since the first sale.
        ``weight2`` is the aggregated ``demand * sell_price`` over the dates
        2016-03-28 .. 2016-04-24, normalised to sum to 1.
    """
    # NOTE(review): the history length (1913 days) and the revenue window are
    # hard-coded; confirm they match the phase this notebook is run for.
    n_days = 1913
    day = ['d_' + str(i + 1) for i in range(n_days)]

    # Only the day columns are needed, so skip the rest while parsing.
    raw_sales = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv',
                            usecols=day)[day].values

    # Sales of every aggregated series, shape (n_series, n_days)
    agg_sales = weight_mat_csr * raw_sales
    del raw_sales

    # 1-based day number wherever something was sold, 0 elsewhere
    # (broadcasting replaces an explicit tiled day-number matrix).
    sold_on = (agg_sales > 0) * np.arange(1, n_days + 1)

    # Zero-based index of the first day with a sale.
    # NOTE(review): a series without any sale yields 9998 here and therefore
    # a negative denominator below.
    start_no = np.min(np.where(sold_on == 0, 9999, sold_on), axis=1) - 1

    weight1 = np.sum((np.diff(agg_sales, axis=1) ** 2), axis=1) / (n_days - start_no)

    recent = data.loc[(data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24'),
                      ['id', 'demand', 'sell_price']]
    # Accumulate in float64: the columns may have been downcast to narrow dtypes.
    amount = recent['demand'].astype('float64') * recent['sell_price'].astype('float64')
    revenue = amount.groupby(recent['id'], sort=False).sum()
    # groupby does not promise the column order of weight_mat_csr, so align
    # the totals explicitly with `product` before the positional product.
    revenue = revenue.reindex(product['id']).fillna(0.0).values

    weight2 = weight_mat_csr * revenue
    weight2 = weight2 / np.sum(weight2)

    del agg_sales, sold_on
    gc.collect()

    return weight1, weight2

# Computed once up front; wrmsse() reads both vectors as module-level globals.
weight1, weight2 = weight_calc(data)

def wrmsse(preds, data):
    """Custom evaluation function passed to LightGBM as ``feval``.

    Relies on the module-level ``NUM_ITEMS``, ``weight_mat_csr``, ``weight1``
    and ``weight2``.  The rows behind ``preds`` are expected to be ordered
    day by day with ``NUM_ITEMS`` consecutive rows per day, because the flat
    vectors are reshaped to (days, NUM_ITEMS).

    Args:
        preds: Flat array of predictions.
        data: Object exposing ``get_label()`` with the matching true values.

    Returns:
        ``('wrmsse', score, False)``: metric name, value, and a flag saying
        that a higher value is not better.
    """
    y_true = np.array(data.get_label())

    num_col = len(y_true) // NUM_ITEMS

    # One row per series, one column per day
    reshaped_preds = preds.reshape(num_col, NUM_ITEMS).T
    reshaped_true = y_true.reshape(num_col, NUM_ITEMS).T

    # Aggregate predictions and actuals in a single sparse product:
    # the first num_col columns hold predictions, the rest actuals.
    aggregated = np.array(weight_mat_csr * np.c_[reshaped_preds, reshaped_true])
    agg_preds = aggregated[:, :num_col]
    agg_true = aggregated[:, num_col:]

    # Per-series scaled RMSE, combined with the per-series weights
    scaled_rmse = np.sqrt(np.mean(np.square(agg_preds - agg_true), axis=1) / weight1)
    score = np.sum(scaled_rmse * weight2)

    return 'wrmsse', score, False

# LightGBM settings.  'metric' is set to 'custom' because the score comes
# from the `feval` function given to lgb.train below.
params = dict(
    boosting_type='gbdt',
    metric='custom',
    objective='regression',
    n_jobs=-1,
    seed=236,
    learning_rate=0.1,
    bagging_fraction=0.75,
    bagging_freq=10,
    colsample_bytree=0.75,
)

train_data = lgb.Dataset(x_train[features], label=y_train)

# The only validation set is the training set itself, so early stopping
# follows the training score.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases; confirm against the installed version.
model = lgb.train(
    params,
    train_data,
    num_boost_round=2500,
    valid_sets=[train_data],
    feval=wrmsse,
    early_stopping_rounds=50,
    verbose_eval=100,
)

# Predict with the iteration that early stopping selected
y_test_pred = model.predict(test[features], num_iteration=model.best_iteration)

Concatenate the predictions for the evaluation rows with the real data for the validation rows to build the submission file.

In [None]:
# assign() returns a new frame, so the predictions are never written into a
# slice of `data`
test = test.assign(demand=y_test_pred)

# Get the predictions for the evaluation rows for submission:
# one row per id, one column per forecast day
predictions = test[['id', 'date', 'demand']]
predictions = pd.pivot(predictions, index='id', columns='date', values='demand').reset_index()
predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

# Get the real data for the validation rows for submission
eval_data = data[['id', 'date', 'demand']]
eval_data = eval_data[(eval_data['date'] > '2016-04-24') & (eval_data['date'] <= '2016-05-22')]
eval_data = pd.pivot(eval_data, index='id', columns='date', values='demand').reset_index()
eval_data.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
# Drop the placeholder validation rows and relabel the rows carrying the
# observed demand so that they fill the validation slots of the submission.
eval_data = eval_data[~eval_data['id'].str.contains("validation")].copy()
eval_data['id'] = eval_data['id'].str.replace("_evaluation", "_validation", regex=False)

# DataFrame.append is gone in pandas 2.0; pd.concat gives the same result
predictions = pd.concat([predictions, eval_data], ignore_index=True)

# to_csv does not create missing directories
os.makedirs('../output/submission-accuracy', exist_ok=True)
predictions.to_csv('../output/submission-accuracy/submission.csv', index=False)