# Data download
We have already done the feature engineering and saved the resulting data in Google Drive so that these steps do not have to be repeated; this saved data is then used on Google Colab to test different models. We also serialize the data frames directly with pickle instead of CSV, so that we do not need to define the data types again each time and can leverage the memory-optimized data frames in the algorithms.

In [1]:
# Source archive: https://drive.google.com/file/d/1ccvbBiFHhlC9ZNLOBllGYBlMgxtUJ3T3/view?usp=drive_link
import gdown

# Google Drive file ID
file_id = '1ccvbBiFHhlC9ZNLOBllGYBlMgxtUJ3T3'

# Destination file path where you want to save the downloaded file
output_path = 'downloaded_file.zip'

# Download the file using gdown
url = f'https://drive.google.com/uc?id={file_id}'
downloaded = gdown.download(url, output_path, quiet=False)

# Some gdown versions signal a failed download by returning None instead of raising,
# so only report success when a path actually came back.
if downloaded is None:
    raise RuntimeError(f"Download of '{url}' to '{output_path}' failed.")

print(f"File '{output_path}' has been downloaded.")


Downloading...
From: https://drive.google.com/uc?id=1ccvbBiFHhlC9ZNLOBllGYBlMgxtUJ3T3
To: /content/downloaded_file.zip
100%|██████████| 2.20G/2.20G [00:23<00:00, 93.0MB/s]


File 'downloaded_file.zip' has been downloaded.


In [2]:
! unzip downloaded_file.zip

Archive:  downloaded_file.zip
replace input_data/enc_feats.pkl? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# Below is the path where the files are saved; the model reads its input files from here to make predictions

In [1]:
# Directory holding the input CSVs and the cached .pkl files.
# The trailing slash matters: later cells build paths as f'{files_path}<file name>'.
files_path = r'/content/input_data/'

Inspired by popular solutions on Kaggle, the code below downloads the necessary data sets from Kaggle to the Google Colab environment.

### Utils and helpers for data pre processing and feature engineering

In [2]:
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
import subprocess, psutil, os

## Below is the regression library being used

import lightgbm as lgb
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping

# LightGBM configuration: Tweedie objective with the settings below
# (we noticed these settings among the best Kaggle scores).
# NOTE(review): the original remark gave "x is in 2 dimensions" as the reason for
# choosing the Tweedie distribution — confirm the intended rationale.

lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    min_child_weight=1,
    learning_rate=0.03,
    num_leaves=2047,          # 2 ** 11 - 1
    min_data_in_leaf=4095,    # 2 ** 12 - 1
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbosity=-1,
    # device='auto'
)

# Early stopping after 50 rounds; the same callback list is passed to every fit() call below.
callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]

# One shared regressor instance, re-fitted for each model trained in the later cells.
lgbm = LGBMRegressor(**lgb_params)

In [3]:
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then rendered with one decimal place, e.g. ``1536 -> '1.5KiB'``.

    Args:
        num: quantity to format (may be negative).
        suffix: unit suffix appended after the prefix.

    Returns:
        The formatted string; values of 1024**8 and above use the 'Yi' prefix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(prefixes):
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefixes[position], suffix)
        value = value / 1024.0
        position += 1
    # Larger than every listed prefix: report in the biggest unit.
    return "%.1f%s%s" % (value, 'Yi', suffix)

def reduce_mem_usage(dataframe, verbose=True):
    """Downcast numeric columns to the narrowest dtype that can hold their value range.

    Columns are converted in place and the same DataFrame object is returned.
    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered; every other column (object, category, datetime, ...) is left
    untouched.

    Dtype limits are treated as inclusive, so a column whose maximum is exactly
    127 is stored as int8 rather than being pushed to int16.

    NOTE(review): float16 keeps only about three significant decimal digits, so
    float columns that fit its range (e.g. prices) are rounded when downcast.

    Args:
        dataframe: pandas DataFrame to shrink (modified in place).
        verbose: when true, print the percentage of memory saved.

    Returns:
        The same ``dataframe`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_memory = dataframe.memory_usage().sum() / 1024**2
    for column_selected in dataframe.columns:
        column_type = dataframe[column_selected].dtypes
        if column_type not in numerics:
            continue

        column_min = dataframe[column_selected].min()
        column_max = dataframe[column_selected].max()

        if str(column_type)[:3] == 'int':
            # No match (e.g. an empty column whose min/max are NaN) leaves the column as it is.
            target_type = None
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= column_min and column_max <= limits.max:
                    target_type = candidate
                    break
        else:
            # Anything that fits neither float16 nor float32 (including all-NaN columns) becomes float64.
            target_type = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= column_min and column_max <= limits.max:
                    target_type = candidate
                    break

        if target_type is not None:
            dataframe[column_selected] = dataframe[column_selected].astype(target_type)

    end_memory = dataframe.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot produce a division warning.
        reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
        print(' Memory usage decreased by ({:.1f}% reduction)'.format(reduction))
    return dataframe

## Merging by concat to not lose optimized dtypes
def merge_by_concat(df1, df2, merge_on):
    """Left-join the columns of ``df2`` onto ``df1`` without rewriting ``df1``'s own columns.

    Only the key columns of ``df1`` take part in the merge; the new columns
    brought in from ``df2`` are then concatenated column-wise next to ``df1``,
    so the dtypes of ``df1``'s existing columns are preserved.

    Args:
        df1: left DataFrame; its columns and row order are kept.
        df2: right DataFrame supplying the additional columns.
        merge_on: list of key column names present in both frames.

    Returns:
        A new DataFrame with ``df1``'s columns followed by the non-key columns
        of the merge result.
    """
    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')
    # merge() returns a fresh 0..n-1 index while concat(axis=1) aligns on index labels.
    # Re-use df1's index so rows line up even when df1 was filtered or re-indexed.
    # (If df2 has duplicate keys the row counts differ and the index is left as merge produced it.)
    if len(merged_gf) == len(df1):
        merged_gf.index = df1.index
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    return pd.concat([df1, merged_gf[new_columns]], axis=1)






### Prices features

Here we do feature engineering to obtain price-related data. We have week-wise price data (price features are available for the test weeks as well).

We use the expanding maximum price, minimum price, standard deviation and mean, so that there is no data leakage from the future into the past and the model relies solely on past data.

(Since the data is already sorted by time, we do not sort it again, which saves computation time.)

In [4]:
# Build and cache the per-(store, item) price statistics unless 'prices.pkl' already exists.
if os.path.exists(f'{files_path}prices.pkl'):
    print("The file 'prices.pkl' already exists. Skipping save operation.")

else:
    print('Creating Price Features, adding max price, min price, std, mean, prev price')
    # calendar.csv was previously loaded here but never used, so it is no longer read.
    price = pd.read_csv(f'{files_path}sell_prices.csv')
    price = reduce_mem_usage(price)

    print(f'Price Stats')
    grp = price.groupby(['store_id','item_id'])['sell_price']
    # Expanding statistics only look at the current and earlier rows of each group.
    # transform() returns values aligned to price's own index, so no re-indexing is needed.
    price['price_max'] = grp.transform(lambda x: x.expanding().max())
    price['price_min'] = grp.transform(lambda x: x.expanding().min())
    price['price_std'] = grp.transform(lambda x: x.expanding().std())
    price['price_mean'] = grp.transform(lambda x: x.expanding().mean())
    # Previous row's price within the same (store, item) group.
    price['prev_sell_price'] = grp.shift(1)
    del grp
    price = reduce_mem_usage(price)
    print(price.columns)
    price.to_pickle(f'{files_path}prices.pkl')
    del price

The file 'prices.pkl' already exists. Skipping save operation.


### Base DataFrame

The data comes in wide format, but since we rely on regression algorithms we convert it to long format and also add rows for the test data.

In [5]:
# Build and cache the long-format base grid unless 'base.pkl' already exists.
if os.path.exists(f'{files_path}base.pkl'):
    print("The file 'base.pkl' already exists. Skipping save operation.")

else:
    print(f'Transforming from wide to deep format. Some data types converstion for memory management')
    TARGET = 'sales'
    END_TRAIN = 1941
    MAIN_INDEX = ['id','d']

    eva = pd.read_csv(f'{files_path}sales_train_evaluation.csv')
    print('Create Grid')
    index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
    # One row per (series, day): the day columns become values of 'd'.
    grid = pd.melt(eva,
                      id_vars = index_columns,
                      var_name = 'd',
                      value_name = TARGET)

    print(f'Train rows. Wide: {len(eva)}, Deep: {len(grid)}')

    # Append 28 days after END_TRAIN with a NaN target for every series.
    # The de-duplicated id frame is the same for each day, so compute it once and
    # concatenate all days in a single call instead of growing a frame inside the loop.
    series_ids = eva[index_columns].drop_duplicates()
    future_days = []
    for i in range(1,29):
        temp_df = series_ids.copy()
        temp_df['d'] = 'd_'+ str(END_TRAIN+i)
        temp_df[TARGET] = np.nan
        future_days.append(temp_df)
    add_grid = pd.concat(future_days)

    grid = pd.concat([grid,add_grid])
    grid = grid.reset_index(drop=True)

    del temp_df, future_days, series_ids, add_grid, eva
    print("{}: {}".format('Original grid',sizeof_fmt(grid.memory_usage(index=True).sum())))

    # Identifier columns repeat heavily, so store them as categoricals.
    for col in index_columns:
        grid[col] = grid[col].astype('category')

    print("{}: {}".format('Reduced grid',sizeof_fmt(grid.memory_usage(index=True).sum())))
    grid = reduce_mem_usage(grid)
    grid.to_pickle(f'{files_path}base.pkl')
    del grid

The file 'base.pkl' already exists. Skipping save operation.


### Calendar Features

We see that the prices of some items only start in a particular week, which might indicate the release week of the product, so we only use the rows of the base data frame from that point onwards (the base data frame, having been melted from the wide format, has rows for all items on all days).
This reduces the size of the data and gives us a feature for when the product was released (capturing any trend in how items sell when we predict volumes close to their release dates).

Then we label-encode the categorical features so that they can be used by the regression algorithms.


In [6]:
# Build and cache 'no_feat.pkl' (base grid + release week, prices, calendar and date features)
# unless it already exists.
if os.path.exists(f'{files_path}no_feat.pkl'):
    print("The file 'no_feat.pkl' already exists. Skipping save operation.")

else:
    print('Creating release week and removing rows before price data exists. Adding price features.')
    price = pd.read_csv(f'{files_path}sell_prices.csv')
    calendar = pd.read_csv(f'{files_path}calendar.csv')

    # 'release' = smallest wm_yr_wk that has a price row for each (store_id, item_id).
    release_df = price.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id','item_id','release']

    grid = pd.read_pickle(f'{files_path}base.pkl')
    grid = merge_by_concat(grid, release_df, ['store_id','item_id'])
    del release_df

    # Attach wm_yr_wk to every day, then keep only the rows at or after the release week.
    grid = merge_by_concat(grid, calendar[['wm_yr_wk','d']], ['d'])
    grid = grid[grid['wm_yr_wk']>=grid['release']]
    # merge_by_concat concatenates column-wise, so restore a 0..n-1 index after filtering rows.
    grid = grid.reset_index(drop=True)
    grid = reduce_mem_usage(grid)

    # Add the remaining sell_prices.csv columns (everything except the merge keys).
    grid = merge_by_concat(grid, price, ['store_id','item_id','wm_yr_wk'])
    grid = reduce_mem_usage(grid)
    print(grid.columns)
    del price, calendar
    # Express the release week relative to the earliest release week in the grid.
    grid['release'] = grid['release'] - grid['release'].min()
    grid['release'] = grid['release'].astype(np.int16)

    # Add the cached price statistics; sell_price is dropped because the grid already has it.
    price = pd.read_pickle(f'{files_path}prices.pkl')
    grid = grid.merge(price.drop(['sell_price'], axis=1), on = ['store_id','item_id','wm_yr_wk'], how='left')

    # calendar.csv is read again (it was deleted above) to add the remaining calendar columns.
    calendar = pd.read_csv(f'{files_path}calendar.csv')
    grid = grid.merge(calendar.drop(['weekday','year','wday','month','wm_yr_wk'], axis=1), on = ['d'], how = 'left')

    le = LabelEncoder()
    cat_vars = ['item_id','store_id','dept_id','cat_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2']
    del price, calendar
    # The single encoder is re-fitted for every column, so the fitted mappings are not kept.
    # NOTE(review): the event_* columns presumably contain missing values on days without
    # events — confirm the installed scikit-learn LabelEncoder handles them as intended.
    for cat in cat_vars:
        grid[cat] = le.fit_transform(grid[cat])

    # Date-derived features: day of month, ISO week, month, year offset from the first year,
    # day of week, and a weekend flag (dayofweek 5 and 6).
    grid['date'] = grid['date'].astype('datetime64[ns]')
    grid['tm_d'] = grid['date'].dt.day.astype(np.int8)
    grid['tm_w'] = grid['date'].dt.isocalendar().week.astype(np.int8)
    grid['tm_m'] = grid['date'].dt.month.astype(np.int8)
    grid['tm_y'] = grid['date'].dt.year
    grid['tm_y'] = (grid['tm_y'] - grid['tm_y'].min()).astype(np.int8)
    grid['tm_dw'] = grid['date'].dt.dayofweek.astype(np.int8)
    grid['tm_w_end'] = (grid['tm_dw'] >= 5).astype(np.int8)
    # Turn 'd_<n>' labels into the integer day number n.
    grid['d'] = grid['d'].str.replace('d_', '').astype('int16')
    grid = reduce_mem_usage(grid)
    grid.to_pickle(f'{files_path}no_feat.pkl')
    del grid

The file 'no_feat.pkl' already exists. Skipping save operation.


### Lag and rolling lags features

Another important feature we observed in winning solutions is that they used lag data and rolling-window data in feature engineering. This shows how trend information can be captured by a regression algorithm, even though we are not specifically using a time-series model.

For this we compute, over rolling windows of 7, 14, 30, 60 and 180 days (a week, two weeks, roughly a month, roughly two months and roughly half a year), the rolling count of days on which 0 units of the product were sold, along with the rolling mean, standard deviation, mean of day-to-day differences and maximum of sales; with this we will be able to capture trend details.

As the next important features we have chosen lag features (these capture the sales lagged by the number of days given in the feature name).

In [7]:

# Build and cache the rolling-window and lag features unless 'lags.pkl' already exists.
if os.path.exists(f'{files_path}lags.pkl'):
    print("The file 'lags.pkl' already exists. Skipping save operation.")

else:
    print(f'Creating a variety lag features')
    grid = pd.read_pickle(f'{files_path}no_feat.pkl')
    grid = grid[['id','d','sales']]

    # 1 where exactly zero units were sold, else 0 (rows with missing sales count as 0).
    # Computed directly on grid; no temporary copy of the frame is needed.
    grid['is_zero'] = (grid['sales'] == 0).astype(int)

    grp = grid.groupby(['id'], group_keys=False, observed=False)['sales']
    grp_z = grid.groupby(['id'], group_keys=False, observed=False)['is_zero']

    grid = reduce_mem_usage(grid)
    print('************ ROLLING LAGS ************')
    for roll in [7, 14, 30, 60, 180]:
        grid[f'rolling_zero_{roll}'] = grp_z.transform(lambda x: x.rolling(roll).sum())
        grid[f'rm_{roll}'] = grp.transform(lambda x: x.rolling(roll).mean())
        grid[f'std_{roll}'] = grp.transform(lambda x: x.rolling(roll).std())
        grid[f'diff_rm_{roll}'] = grp.transform(lambda x : x.diff().rolling(roll).mean())
        grid[f'max_{roll}'] = grp.transform(lambda x: x.rolling(roll).max())
        # Downcast after every window size to keep peak memory low.
        grid = reduce_mem_usage(grid)

    grid = reduce_mem_usage(grid)
    print('************ LAGS ************')
    # lag_0 is the sales value itself; lag_k is the value k rows earlier within the same id.
    # GroupBy.shift does the per-group shift without a Python-level lambda per group.
    for lag in range(15):
        grid[f'lag_{lag}'] = grp.shift(lag)

    grid = grid.drop(['is_zero', 'sales'], axis = 1)
    # Drop rows up to day 1941 that lack a full feature history; later days are kept even with NaNs.
    ix_to_drop = grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index
    grid.drop(index=ix_to_drop, inplace=True)
    grid = reduce_mem_usage(grid)
    grid.to_pickle(f'{files_path}lags.pkl')
    del grid

The file 'lags.pkl' already exists. Skipping save operation.


### Categorical Encodings

We then use category-wise, item-wise and department-wise sales data (across all stores), and also store-and-category-wise, store-and-item-wise and store-and-department-wise sales data.

This gives cross-sectional features that our model can pick up on if there is any trend.

In [8]:
# Expanding-mean encodings DataFrame - only needs to be run once.
if not os.path.exists(f'{files_path}enc_feats.pkl'):
    print(f'Creating categorical feature encodings')
    needed_columns = ['id','d','sales','item_id','dept_id','cat_id','store_id','state_id']
    grid = pd.read_pickle(f'{files_path}no_feat.pkl')[needed_columns]
    grid = reduce_mem_usage(grid)

    # Each entry is one grouping; the encoding column is named '<keys joined by _>_enc'.
    groupings = [
        ['cat_id'],
        ['item_id'],
        ['dept_id'],
        ['store_id'],
        ['store_id', 'cat_id'],
        ['store_id', 'item_id'],
        ['store_id', 'dept_id'],
    ]
    for keys in groupings:
        enc_name = '_'.join(keys) + '_enc'
        # Running mean of sales within the group, in row order.
        sales_by_group = grid.groupby(keys, observed=False)['sales']
        grid[enc_name] = sales_by_group.transform(lambda s: s.expanding().mean())

    print('************ CATEGORIES ENCODED ************')
    # Keep only id, d and the encodings, then shrink before saving.
    grid = grid.drop(columns=['sales','item_id','dept_id','cat_id','store_id','state_id'])
    grid = reduce_mem_usage(grid)
    grid.to_pickle(f'{files_path}enc_feats.pkl')
    del grid

else:
    print("The file 'enc_feats.pkl' already exists. Skipping save operation.")

The file 'enc_feats.pkl' already exists. Skipping save operation.


### Set up

### Train and Predict

First we need to do a model comparison to see which model produces the better Kaggle score and use that model, then optimize the step size so as to improve the score further.

Through this process we basically use the sales data that we have at t − step (for predicting days 1–14 the step is 14 days, and for days 15–28 the step is 28 days).

Then we run the prediction model, where we first loop over store and department to train the model (slower), next over store and category (quicker), and take the average of both methods to arrive at the final submission.


In [9]:
# ---- Forecast layout -------------------------------------------------------
horizon = 28
STEPS = [14, 28]
TARGET = ['sales']
VAL_DAYS = TEST_DAYS = STEPS[0]

# ---- Day boundaries (integer day numbers) ----------------------------------
train_start = 1
train_end = 1941 - horizon
first_val_day = train_end + 1
last_val_day = 1941
first_pred_day = 1941 + 1
val_start = 1942-28
val_end = 1941

# Columns of the base grid that are identifiers / target rather than features.
remove_colums = ['id','item_id','dept_id','cat_id','store_id','state_id','d','sales','wm_yr_wk','date']

# ---- Category values and feature names, read from the cached frames --------
base = pd.read_pickle(f'{files_path}no_feat.pkl')
STORES = base.store_id.unique()
DEPTS = base.dept_id.unique()
CATS = base.cat_id.unique()
ITEMS = base.item_id.unique()
base_feature_columns = [name for name in base.columns if name not in remove_colums]

feats = pd.read_pickle(f'{files_path}lags.pkl')
enc_feats = pd.read_pickle(f'{files_path}enc_feats.pkl')

# The first two columns of both feature frames are skipped (assumed to be 'id' and 'd').
enc_columns = enc_feats.columns[2:]
lags_columns = [*feats.columns[2:], *enc_columns]
train_columns = base_feature_columns + list(lags_columns)

del base, feats, enc_feats

In [None]:
# Train one model per (store, department, step) and collect its forecasts.
# Forecast frames are gathered in a list and concatenated once at the end.
prediction_parts = []

for store in STORES:
    print(f'************ Training Store {store+1} ************')
    for dept in DEPTS:
        print(f'************ department {dept} ************')

        # Load and merge the features of this store/department once; they do not depend
        # on the step, so both steps reuse the same merged frame.
        base = pd.read_pickle(f'{files_path}no_feat.pkl')
        base = base[(base['dept_id']==dept) & (base['store_id']==store)]
        store_ids = base.id.unique()
        feats = pd.read_pickle(f'{files_path}lags.pkl')
        feats = feats[feats['id'].isin(store_ids)]
        enc_feats = pd.read_pickle(f'{files_path}enc_feats.pkl')
        enc_feats = enc_feats[enc_feats['id'].isin(store_ids)]
        merged = base.merge(feats, on=['id', 'd'], how='left')
        del feats, base
        merged = merged.merge(enc_feats, on=['id', 'd'])
        del enc_feats

        for step in STEPS:
            # Shift every lag/encoding feature by `step` days within each series so a row
            # only carries values from at least `step` days earlier.
            grid = merged.copy()
            grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            ix_to_drop = grid[(grid['d'] <= 1941) & grid.isna().any(axis=1)].index
            grid.drop(index=ix_to_drop, inplace=True)

            pred_start = first_pred_day + step - VAL_DAYS
            pred_end = first_pred_day + step - 1
            print(f'Val start: {val_start}. Val end {val_end}. Pred start {pred_start} Pred end {pred_end}')

            # Row masks for the three day ranges, computed once each.
            in_train = (grid['d'] >= train_start) & (grid['d'] <= train_end)
            in_val = (grid['d'] >= val_start) & (grid['d'] <= val_end)
            in_test = (grid['d'] >= pred_start) & (grid['d'] <= pred_end)

            trainX = grid.loc[in_train, train_columns]
            trainY = grid.loc[in_train, TARGET]
            valX = grid.loc[in_val, train_columns]
            valY = grid.loc[in_val, TARGET]
            testX = grid.loc[in_test, train_columns]
            print(f'Train shape: {trainX.shape}. Val shape: {valX.shape}. Test shape: {testX.shape}')

            # Train
            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            # Explicit copy so the new column is written to an independent frame.
            preds = grid.loc[in_test, ['id', 'd']].copy()
            preds['sales'] = yhat
            prediction_parts.append(preds)

        del merged

predictions = pd.concat(prediction_parts, axis=0) if prediction_parts else pd.DataFrame()
del prediction_parts
predictions.to_pickle(f'{files_path}store_depts_preds.pkl')
del predictions

************ Training Store 1 ************
************ department 3 ************
Val start: 1914. Val end 1941. Pred start 1942 Pred end 1955
Train shape: (571079, 67). Val shape: (11648, 67). Test shape: (5824, 67)
Training until validation scores don't improve for 50 rounds


In [None]:
# Train one model per (store, category, step) and collect its forecasts.
# Forecast frames are gathered in a list and concatenated once at the end.
prediction_parts = []

for store in STORES:
    print(f'************ Training Store {store+1} ************')
    for cat in CATS:
        print(f'************ category {cat} ************')

        # Load and merge the features of this store/category once; they do not depend
        # on the step, so both steps reuse the same merged frame.
        base = pd.read_pickle(f'{files_path}no_feat.pkl')
        base = base[(base['cat_id']==cat) & (base['store_id']==store)]
        store_ids = base.id.unique()
        feats = pd.read_pickle(f'{files_path}lags.pkl')
        feats = feats[feats['id'].isin(store_ids)]
        enc_feats = pd.read_pickle(f'{files_path}enc_feats.pkl')
        enc_feats = enc_feats[enc_feats['id'].isin(store_ids)]
        merged = base.merge(feats, on=['id', 'd'], how='left')
        del feats, base
        merged = merged.merge(enc_feats, on=['id', 'd'])
        del enc_feats

        for step in STEPS:
            # Shift every lag/encoding feature by `step` days within each series so a row
            # only carries values from at least `step` days earlier.
            grid = merged.copy()
            grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            ix_to_drop = grid[(grid['d'] <= 1941) & grid.isna().any(axis=1)].index
            grid.drop(index=ix_to_drop, inplace=True)

            pred_start = first_pred_day + step - VAL_DAYS
            pred_end = first_pred_day + step - 1
            print(f'Val start: {val_start}. Val end {val_end}. Pred start {pred_start} Pred end {pred_end}')

            # Row masks for the three day ranges, computed once each.
            in_train = (grid['d'] >= train_start) & (grid['d'] <= train_end)
            in_val = (grid['d'] >= val_start) & (grid['d'] <= val_end)
            in_test = (grid['d'] >= pred_start) & (grid['d'] <= pred_end)

            trainX = grid.loc[in_train, train_columns]
            trainY = grid.loc[in_train, TARGET]
            valX = grid.loc[in_val, train_columns]
            valY = grid.loc[in_val, TARGET]
            testX = grid.loc[in_test, train_columns]
            print(f'Train shape: {trainX.shape}. Val shape: {valX.shape}. Test shape: {testX.shape}')

            # Train
            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            # Explicit copy so the new column is written to an independent frame.
            preds = grid.loc[in_test, ['id', 'd']].copy()
            preds['sales'] = yhat
            prediction_parts.append(preds)

        del merged

predictions = pd.concat(prediction_parts, axis=0) if prediction_parts else pd.DataFrame()
del prediction_parts
predictions.to_pickle(f'{files_path}store_cats_preds.pkl')
del predictions

### Submission

In [None]:
# Submission template: supplies the output column names and the full list of ids.
submission = pd.read_csv(f'{files_path}sample_submission.csv')

# Forecasts of the two model families, keyed by (id, d) so they align row by row.
pred1 = pd.read_pickle(f'{files_path}store_depts_preds.pkl').set_index(['id', 'd'])
pred2 = pd.read_pickle(f'{files_path}store_cats_preds.pkl').set_index(['id', 'd'])

# Simple average of both forecasts, back to ordinary columns.
df_avg = pred1.add(pred2).div(2).reset_index()

# One row per id and one column per forecast day, renamed to the template's column names.
wide = df_avg.pivot(index='id', columns='d', values='sales').reset_index()
wide.columns = submission.columns

# Keep the template's ids and order; ids without a forecast are filled with 1.
df_avg = submission[['id']].merge(wide, on='id', how='left').fillna(1)
print(df_avg)

submission_file = f"{files_path}submission_svr.csv"
df_avg.to_csv(submission_file, index=False)
