In [1]:
import numpy as np
import pandas as pd
import os
import time
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import pickle
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# reduce memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (int8, unsigned, bool, object, category,
    datetime, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    * Range checks are inclusive, so a value sitting exactly on a dtype limit
      (e.g. 127 for int8) still qualifies for that dtype.
    * A column is never widened: if no narrower dtype fits, or the min/max is
      NaN (empty or all-NaN column), the column keeps its current dtype.
    * Only the value *range* is checked for floats. float16 keeps roughly three
      significant decimal digits, so values that fit the range are still
      rounded when downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty / all-NaN columns have no usable range; leave them alone
        # instead of falling through to a float64 cast.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        is_int = str(col_type)[:3] == 'int'
        candidates = int_candidates if is_int else float_candidates
        limits = np.iinfo if is_int else np.finfo
        for candidate in candidates:
            if np.dtype(candidate) == col_type:
                # Nothing narrower fits; keep the column as it is.
                break
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

# Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    """Left-join the non-key columns of ``df2`` onto ``df1`` without re-creating ``df1``'s columns.

    Only the key columns of ``df1`` take part in the merge; the looked-up
    columns are then glued on with ``pd.concat``, so the existing columns of
    ``df1`` keep their dtypes (e.g. ``category``).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame; its rows, row order and index are preserved.
    df2 : pandas.DataFrame
        Lookup frame. Must hold at most one row per key combination.
    merge_on : label or list of labels
        Key column(s) present in both frames.

    Returns
    -------
    pandas.DataFrame
        ``df1`` plus every column of ``df2`` that is not a key; rows without a
        match in ``df2`` get NaN in the new columns.

    Raises
    ------
    ValueError
        If ``df2`` has duplicate keys, which would multiply rows and make a
        positional column-wise concat meaningless.
    """
    if not isinstance(merge_on, list):
        merge_on = [merge_on]

    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')
    if len(merged_gf) != len(df1):
        raise ValueError(
            'merge_by_concat: df2 has duplicate keys for {}; the left join produced '
            '{} rows for {} input rows'.format(merge_on, len(merged_gf), len(df1))
        )

    # merge() hands back a fresh 0..n-1 index while concat(axis=1) aligns on
    # index labels. The left join keeps df1's row order, so reuse df1's index
    # to line rows up correctly even when df1 is not on a default RangeIndex.
    merged_gf.index = df1.index

    new_columns = [col for col in merged_gf.columns if col not in merge_on]
    return pd.concat([df1, merged_gf[new_columns]], axis=1)

In [3]:
# Directory holding the three raw competition CSV files.
INPUT_DIR = './data/'

# Read calendar, sales and price tables in one pass; the tuple order matches
# the order of the file names.
cal, ste, sellp = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ('calendar.csv', 'sales_train_evaluation.csv', 'sell_prices.csv')
)

In [4]:
# Append empty (NaN) sales columns d_1942 ... d_1969 so these days
# exist as rows once the frame is reshaped to long format.
for day_num in range(1942, 1970):
    ste[f"d_{day_num}"] = np.nan

In [5]:
# Reshape sales from wide (one column per day) to long (one row per series and day):
# the day label goes to column 'd', the value to column 'sales'.
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
process_df = ste.melt(id_vars=index_columns, var_name='d', value_name='sales')

# Store the identifier columns as categoricals.
for id_col in index_columns:
    process_df[id_col] = process_df[id_col].astype('category')

In [6]:
# Calendar columns to attach to every sales row (joined on the day label 'd').
cal_cols = [
    'date', 'wm_yr_wk', 'weekday', 'month', 'year', 'd',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
]
calendar_subset = cal[cal_cols]
process_df = merge_by_concat(process_df, calendar_subset, ['d'])

# Subset of the calendar columns (plus 'd') that is stored as categoricals;
# 'date', 'wm_yr_wk', 'month' and 'year' keep their original dtype.
cal_cols = [
    'weekday', 'd',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
]
for cat_col in cal_cols:
    process_df[cat_col] = process_df[cat_col].astype('category')

In [7]:
# feature engineering
# date features
process_df['date'] = pd.to_datetime(process_df['date'])

process_df['day'] = process_df['date'].dt.day.astype(np.int8)
# Week of the month (1-5), derived from the day of month. The previous
# `.dt.week` accessor produced the ISO week of the *year* (1-53), which did not
# match this column's name, and the accessor no longer exists in pandas >= 2.0.
process_df['week_in_month'] = ((process_df['day'] - 1) // 7 + 1).astype(np.int8)
# Vectorised membership test instead of a per-row Python lambda that rebuilt
# the lookup set on every call.
process_df['is_weekend'] = process_df['weekday'].isin(['Saturday', 'Sunday']).astype(bool)

# Attach every non-key column of the price table, matched on item, week and store.
process_df = merge_by_concat(process_df, sellp, ['item_id', 'wm_yr_wk', 'store_id'])

# Turn the day label into its number ('d_1942' -> 1942).
# NOTE: this rewrites 'd' in place, so the cell cannot be re-run on its own
# output (the integers have no .split).
process_df['d'] = process_df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)

In [8]:
# feature engineering
# NOTE: shift/rolling work on row order inside each 'id' group, so rows are
# assumed to be in chronological order per id (pd.melt emits the d_* columns in
# their original order) - confirm no earlier step reorders the rows.
# Build the grouping once instead of re-grouping the frame for every feature.
sales_by_id = process_df.groupby(by="id")['sales']

# lag features: sales `size` rows earlier within the same id (28, 31, ..., 40)
for size in tqdm(range(28,28+15,3)):
    # GroupBy.shift gives the same result as transform(lambda x: x.shift(size))
    # without calling a Python lambda once per group.
    process_df[f"sales_{size}_lag"] = sales_by_id.shift(size).astype(np.float16)

# shifted moving averages: rolling statistics over the series shifted by 28 rows
# The 28-row shift is the same for every window, so compute it a single time
# and regroup the shifted values by id.
shifted_by_id = sales_by_id.shift(28).groupby(process_df['id'])
for size in tqdm([7, 30, 60, 90, 180]):
    process_df[f"sales_{size}_rolling_mean"] = shifted_by_id.transform(lambda x: x.rolling(size).mean()).astype(np.float16)
    process_df[f"sales_{size}_rolling_std"] = shifted_by_id.transform(lambda x: x.rolling(size).std()).astype(np.float16)

# Drop the helpers so the kernel does not keep the full-length shifted series alive.
del sales_by_id, shifted_by_id

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# memory reduction
# Downcast the numeric columns (with the size report printed), then write the
# engineered frame to disk as a pickle.
process_df = reduce_mem_usage(process_df, verbose=True)
process_df.to_pickle(path='processed.pkl')

  0%|          | 0/39 [00:00<?, ?it/s]

Mem. usage decreased to 3894.84 Mb (31.3% reduction)
