In [None]:
# standard library first, then third-party packages
import itertools

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from lightgbm import early_stopping

In [None]:
def get_df_name(df):
    """Return the name of the first global variable that is bound to `df`.

    The match is by identity (`is`), not by equality, and the search covers the
    globals of the module/notebook in which this function is defined.

    Parameters
    ----------
    df : object
        The object (typically a DataFrame) whose variable name is wanted.

    Returns
    -------
    str
        The first matching global name, or ``'<unnamed>'`` when no global
        refers to the object (a temporary, a function-local value, ...).
        Previously that case raised ``IndexError``.
    """
    # Iterate over a snapshot so the scan cannot trip over a namespace change.
    for name, value in list(globals().items()):
        if value is df:
            return name
    return '<unnamed>'

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` to the smallest dtype that holds them.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16 and float32, falling back to float64. The range
    checks are inclusive, so a column whose extremes sit exactly on a dtype's
    limits (e.g. -128..127) still gets that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after the in-place conversion.

    Notes
    -----
    Only columns whose dtype is listed in `numerics` are touched. The choice is
    made on value range alone: float16 keeps roughly three significant decimal
    digits, so float values may be rounded by the conversion.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Smallest integer type whose range contains [c_min, c_max];
            # the column is left untouched if none matches (e.g. empty column).
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Default to float64; this is also where all-NaN or infinite
            # columns end up because every comparison below is then False.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        try:
            label = get_df_name(df)
        except IndexError:
            # df is not bound to any global name; report it generically.
            label = 'DataFrame'
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(label, ' Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Baseline LightGBM settings: Tweedie objective, GPU training, row and feature
# subsampling at 50%.
# NOTE: both model cells further down assign their own `lgb_params`, which
# replaces this dict. This copy additionally sets `min_child_weight` and uses
# the `verbosity` spelling instead of `verbose`.
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'device': 'gpu',
        'subsample_freq': 1,
        'min_child_weight': 1,
        'learning_rate': 0.03,
        'num_leaves': 2 ** 11 - 1,
        'min_data_in_leaf': 2 ** 12 - 1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': 1400,
        'boost_from_average': False,
        'verbosity': -1
        }

# Raw input tables, read relative to the notebook's working directory.
# Forward slashes are used because they resolve on Windows and on POSIX
# systems alike, whereas a backslash is only a separator on Windows.
raw_data = pd.read_csv('data/sales_train_evaluation.csv')
calendar = pd.read_csv('data/calendar.csv')
sell_prices = pd.read_csv('data/sell_prices.csv')
submission = pd.read_csv('data/sample_submission.csv')

Feature Engineering

In [None]:
# Build calendar, rolling, lag and price features on the (id, day) grid and
# store the result for the model cells.
grid = pd.read_pickle(r'df_nofeat_noscale.pkl')
grid['date'] = grid['date'].astype('datetime64[ns]')

# --- Calendar features -------------------------------------------------------
grid['tm_d'] = grid['date'].dt.day.astype(np.int8)
# `Series.dt.week` was removed in pandas 2.0; isocalendar().week is the
# documented replacement and yields the same ISO week number.
grid['tm_w'] = grid['date'].dt.isocalendar().week.astype(np.int8)
grid['tm_m'] = grid['date'].dt.month.astype(np.int8)
year = grid['date'].dt.year
grid['tm_y'] = (year - year.min()).astype(np.int8)  # years since the first year present
grid['tm_dw'] = grid['date'].dt.dayofweek.astype(np.int8)
grid['tm_w_end'] = (grid['tm_dw'] >= 5).astype(np.int8)  # dayofweek 5/6 = Saturday/Sunday

# With horizon = 0 the shift below is a no-op, so every rolling window ends on
# the current day. The model cells shift these columns by their own step later.
horizon = 0
sales_by_id = grid.groupby(['id'], group_keys=False)['sales']

print('************ ROLLING MEANS ************')
for roll in [7,14,30,60,180]:
    grid['rm_' + str(roll)] = sales_by_id.apply(lambda x: x.shift(horizon).rolling(roll).mean())

print('************ ROLLING STATS ************')
for roll in [7,14,30,60,180]:
    grid['max_' + str(roll)] = sales_by_id.apply(lambda x: x.shift(horizon).rolling(roll).max())
    grid['std_' + str(roll)] = sales_by_id.apply(lambda x: x.shift(horizon).rolling(roll).std())

print('************ DIFF MEANS ************')
# Rolling mean of the day-over-day change in sales.
for window in [7,56,140]:
    grid['diff_rm_' + str(window)] = sales_by_id.apply(lambda x : x.shift(horizon).diff().rolling(window).mean())

print('************ LAGS ************')
for lag in [1,2,3,4,5,6,7]:
    grid['lag_' + str(lag)] = sales_by_id.apply(lambda x: x.shift(lag))

print('************ ROLLING ZEROS ************')
# Number of zero-sales days in each window. The indicator does not depend on
# the window size, so it is built once (vectorised) and dropped afterwards.
grid['is_zero'] = (grid['sales'] == 0).astype(np.int8)
zero_by_id = grid.groupby(['id'], group_keys=False)['is_zero']
for roll in [7,56,140]:
    grid['rolling_zero_' + str(roll)] = zero_by_id.apply(lambda x : x.shift(horizon).rolling(roll).sum())
grid = grid.drop(columns='is_zero')

print('************ ROLLING PRICES ************')
# Expanding (history-to-date) price statistics per id.
price_by_id = grid.groupby(['id'], group_keys=False)['sell_price']
grid['price_max'] = price_by_id.apply(lambda x : x.expanding().max())
grid['price_min'] = price_by_id.apply(lambda x : x.expanding().min())
grid['price_std'] = price_by_id.apply(lambda x : x.expanding().std())
grid['price_mean'] = price_by_id.apply(lambda x : x.expanding().mean())
# Price relative to the highest price seen so far for that id.
grid['price_norm'] = grid['sell_price']/grid['price_max']
# Some items can be inflation dependent and some items are very "stable":
# count distinct prices per store/item, and distinct items per store/price.
grid['price_nunique'] = grid.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
grid['item_nunique'] = grid.groupby(['store_id','sell_price'])['item_id'].transform('nunique')

# Drop every row that still has a NaN in any column (for instance the warm-up
# rows of the longest rolling windows), then shrink dtypes and persist.
grid = grid.dropna()
grid = reduce_mem_usage(grid)
grid.to_pickle('data/df_feat_no_shift_2.pkl')
grid.shape

#### Stores & Dept models - Multi-step model

In [None]:
# One LightGBM model per (store, department, step). For a given step the
# lag/rolling features are shifted by `step` days within each id, and the model
# forecasts the VAL_DAYS-day block that ends `step` days after day 1941.
STEPS = [4,8,12,16,20,24,28] # Training/Prediction every 4 days (Compromise between very granular and long time horizon)
VAL_DAYS, TEST_DAYS = STEPS[0], STEPS[0]
STORES = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
TARGET = ['sales']

horizon = 28
first_train_day = 1850
last_train_day = 1941 - horizon

first_val_day = last_train_day + 1
last_val_day = 1941

first_pred_day = 1941 + 1
train_days = last_train_day-first_train_day

train_start = first_train_day
train_end = last_train_day

# Columns that are never fed to the model (identifiers, dates and the target).
remove_colums = ['item_id', 'store_id', 'state_id', 'id', 'd', 'date', 'weekday',
       'sales', 'year']

# Sales-derived features; these are the columns shifted by `step` below.
lags_columns = ['rm_7', 'rm_14', 'rm_30', 'rm_60', 'rm_180', 'max_7', 'std_7', 'max_14', 'std_14', 'max_30',
       'std_30', 'max_60', 'std_60', 'max_180', 'std_180', 'diff_rm_7', 'diff_rm_56', 'diff_rm_140', 'lag_1',
       'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_zero_7', 'rolling_zero_56', 'rolling_zero_140']

# Identical for every model, so it is defined once instead of inside the loop.
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'device': 'gpu',
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2 ** 11 - 1,
        'min_data_in_leaf': 2 ** 12 - 1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': 1400,
        'boost_from_average': False,
        'verbose': -1
        }

# Load the feature grid once, from the location the feature-engineering cell
# writes to, rather than re-reading it for every store/department/step.
grid_full = pd.read_pickle('data/df_feat_no_shift_2.pkl')
train_columns = grid_full.columns[~grid_full.columns.isin(remove_colums)]
DEPTS = grid_full.dept_id.unique()

key_columns = ['dept_id', 'store_id', 'id']
grid_full[key_columns] = grid_full[key_columns].astype('category')

fold_predictions = []
for store in STORES:
    for dept in DEPTS:
        subset = grid_full[(grid_full['store_id'] == store) & (grid_full['dept_id'] == dept)]

        for step in STEPS:
            # Work on a copy so the shift never leaks into the next step.
            grid = subset.copy()
            grid[lags_columns] = grid.groupby(['id'])[lags_columns].shift(step)
            # NOTE(review): dropna() removes any row holding a NaN in any column,
            # so forecast-day rows survive only if `sales` is non-null there;
            # confirm against how the grid was built.
            grid = grid.dropna()

            val_start = first_val_day + step - VAL_DAYS
            val_end = first_val_day + step - 1
            pred_start = first_pred_day + step - VAL_DAYS
            pred_end = first_pred_day + step - 1

            # Training uses every remaining day up to train_end
            # (first_train_day is not applied as a lower bound).
            is_train = grid['d'] <= train_end
            is_val = grid['d'].between(val_start, val_end)
            is_pred = grid['d'].between(pred_start, pred_end)

            trainX = grid.loc[is_train, train_columns]
            trainY = grid.loc[is_train, TARGET]
            valX = grid.loc[is_val, train_columns]
            valY = grid.loc[is_val, TARGET]
            testX = grid.loc[is_pred, train_columns]

            # Train. Early stopping is passed as a callback: the
            # `early_stopping_rounds` argument of fit() was removed in LightGBM 4.
            lgbm = LGBMRegressor(**lgb_params)
            lgbm.fit(trainX, trainY, eval_set=[(valX, valY)], eval_metric='rmse',
                     callbacks=[early_stopping(50)])

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            preds = grid.loc[is_pred, ['id', 'd']].copy()
            preds['sales'] = yhat
            fold_predictions.append(preds)

# Release the full grid; only the per-model predictions are needed from here on.
del grid_full

# Concatenate once at the end instead of growing a frame inside the loop.
predictions = pd.concat(fold_predictions, axis=0)
predictions.to_pickle('sub_7_models_alltrainingdata_dept&stores.pkl')

# Submission: one row per id, one column per forecast day, renamed to the
# sample submission's column names. Ids without a prediction are filled with 1.
submission = pd.read_csv('data/sample_submission.csv')
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
predictions.to_csv('submission.csv', index=False)

Stores Model - Multi-Step

In [None]:
# One LightGBM model per (store, step). For a given step the lag/rolling
# features are shifted by `step` days within each id, and the model forecasts
# the VAL_DAYS-day block that ends `step` days after day 1941.
STEPS = [4,8,12,16,20,24,28]
VAL_DAYS, TEST_DAYS = STEPS[0], STEPS[0]
STORES = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
TARGET = ['sales']

horizon = 28
first_train_day = 1850
last_train_day = 1941 - horizon

first_val_day = last_train_day + 1
last_val_day = 1941

first_pred_day = 1941 + 1
train_days = last_train_day-first_train_day

train_start = first_train_day
train_end = last_train_day

# Columns that are never fed to the model (identifiers, dates and the target).
remove_colums = ['item_id', 'store_id', 'state_id', 'id', 'd', 'date', 'weekday',
       'sales', 'year']

# Sales-derived features; these are the columns shifted by `step` below.
lags_columns = ['rm_7', 'rm_14', 'rm_30', 'rm_60', 'rm_180', 'max_7', 'std_7', 'max_14', 'std_14', 'max_30',
       'std_30', 'max_60', 'std_60', 'max_180', 'std_180', 'diff_rm_7', 'diff_rm_56', 'diff_rm_140', 'lag_1',
       'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_zero_7', 'rolling_zero_56', 'rolling_zero_140']

# Identical for every model, so it is defined once instead of inside the loop.
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'device': 'gpu',
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2 ** 11 - 1,
        'min_data_in_leaf': 2 ** 12 - 1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': 1400,
        'boost_from_average': False,
        'verbose': -1
        }

# Load the feature grid once instead of re-reading it for every store and step.
grid_full = pd.read_pickle('data/df_feat_no_shift_2.pkl')
train_columns = grid_full.columns[~grid_full.columns.isin(remove_colums)]
DEPTS = grid_full.dept_id.unique()

key_columns = ['dept_id', 'store_id', 'id']
grid_full[key_columns] = grid_full[key_columns].astype('category')

fold_predictions = []
for store in STORES:
    # Store-level model: keep every department of the store. The previous
    # filter also compared against `dept`, a name this cell never defines.
    subset = grid_full[grid_full['store_id'] == store]

    for step in STEPS:
        # Work on a copy so the shift never leaks into the next step.
        grid = subset.copy()
        grid[lags_columns] = grid.groupby(['id'])[lags_columns].shift(step)
        # NOTE(review): dropna() removes any row holding a NaN in any column,
        # so forecast-day rows survive only if `sales` is non-null there;
        # confirm against how the grid was built.
        grid = grid.dropna()

        val_start = first_val_day + step - VAL_DAYS
        val_end = first_val_day + step - 1
        pred_start = first_pred_day + step - VAL_DAYS
        pred_end = first_pred_day + step - 1

        # Training uses every remaining day up to train_end
        # (first_train_day is not applied as a lower bound).
        is_train = grid['d'] <= train_end
        is_val = grid['d'].between(val_start, val_end)
        is_pred = grid['d'].between(pred_start, pred_end)

        trainX = grid.loc[is_train, train_columns]
        trainY = grid.loc[is_train, TARGET]
        valX = grid.loc[is_val, train_columns]
        valY = grid.loc[is_val, TARGET]
        testX = grid.loc[is_pred, train_columns]

        # Train. Early stopping is passed as a callback: the
        # `early_stopping_rounds` argument of fit() was removed in LightGBM 4.
        lgbm = LGBMRegressor(**lgb_params)
        lgbm.fit(trainX, trainY, eval_set=[(valX, valY)], eval_metric='rmse',
                 callbacks=[early_stopping(50)])

        # Predictions
        yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
        preds = grid.loc[is_pred, ['id', 'd']].copy()
        preds['sales'] = yhat
        fold_predictions.append(preds)

# Release the full grid; only the per-model predictions are needed from here on.
del grid_full

# Concatenate once at the end instead of growing a frame inside the loop.
predictions = pd.concat(fold_predictions, axis=0)
predictions.to_pickle('sub_14_models_alltrainingdata_stores.pkl')

# Submission: one row per id, one column per forecast day, renamed to the
# sample submission's column names. Ids without a prediction are filled with 1.
submission = pd.read_csv('data/sample_submission.csv')
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
predictions.to_csv('submission.csv', index=False)

Manual Ensemble

In [None]:
# NOTE: this whole cell is currently disabled. Every prediction load below is
# commented out, and the blending code is wrapped in a bare triple-quoted
# string, so running the cell only evaluates that string and writes no file.
#
# What the disabled code would do: concatenate the chosen prediction frames,
# average `sales` per (id, d), pivot to one column per day, rename the columns
# to the sample submission's, left-merge onto the submission ids filling gaps
# with 1, and write 'submission_4.csv'.
#
# Before re-enabling, check that:
#   * `pred1` and `pred3` are loaded, since those are the two frames the
#     string concatenates;
#   * the sample-submission path is right: inside the string it starts with a
#     backslash, i.e. it is rooted rather than relative to the working
#     directory (NOTE(review): confirm the intended location);
#   * the string is not raw, so `\d` and `\s` are invalid escape sequences
#     (newer Python versions warn about them).

# Candidate set: 14-model runs.
# pred1 = pd.read_pickle('store_dept_14_models.pkl')
# pred2 = pd.read_pickle('global_14_models.pkl')
# pred3 = pd.read_pickle('dept_14_models.pkl')
# pred4 = pd.read_pickle('stores_14_models.pkl')

# Candidate set: 4-model runs.
# pred1 = pd.read_pickle('store_dept_4_models.pkl')
# pred2 = pd.read_pickle('global_4_models.pkl')
# pred3 = pd.read_pickle('dept_4_models.pkl')
# pred4 = pd.read_pickle('stores_4_models.pkl')

# Candidate set: mix of "alltrainingdata" 4-model runs and 14-model runs.
#pred1 = pd.read_pickle('sub_4_models_alltrainingdata_dept&stores.pkl')
#pred2 = pd.read_pickle('sub_4_models_alltrainingdata_stores.pkl')
#pred3 = pd.read_pickle('store_dept_14_models.pkl')
#pred4 = pd.read_pickle('stores_14_models.pkl')


"""
submission = pd.read_csv(r'\data\sample_submission.csv')
#results = pd.concat([pred1, pred2, pred3, pred4], axis=0)
results = pd.concat([pred1, pred3], axis=0)
results = results.groupby(['id', 'd']).mean().reset_index()
results = results.pivot(index='id', columns='d', values='sales').reset_index()
results.columns = submission.columns
results = submission[['id']].merge(results, on='id', how='left').fillna(1)
results.to_csv('submission_4.csv', index=False)
"""