In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import subprocess, psutil, os

from lightgbm.callback import early_stopping
from sklearn.preprocessing import LabelEncoder

def get_grid_name(grid):
    name =[x for x in globals() if globals()[x] is grid][0]
    return name

## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

def reduce_mem_usage(grid, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = grid.memory_usage().sum() / 1024**2    
    for col in grid.columns:
        col_type = grid[col].dtypes
        if col_type in numerics:
            c_min = grid[col].min()
            c_max = grid[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    grid.loc[:,col] = grid[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    grid.loc[:,col] = grid[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    grid.loc[:,col] = grid[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    grid.loc[:,col] = grid[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    grid.loc[:,col] = grid[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    grid.loc[:,col] = grid[col].astype(np.float32)
                else:
                    grid.loc[:,col] = grid[col].astype(np.float64)    
    end_mem = grid.memory_usage().sum() / 1024**2
    if verbose: 
        print(' Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return grid

def submit_to_kaggle(competition_name, submission_file, message):
    kaggle_path = "/root/miniconda3/envs/lightgbm/bin/kaggle"
    subprocess.run([kaggle_path, "competitions", "submit", "-c", competition_name, "-f", submission_file, "-m", message])

## Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    merged_gf = df1[merge_on]
    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
    return df1    

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    'subsample': 0.5,
    'device_type': 'cpu',
    'subsample_freq': 1,
    'min_child_weight': 1,
    'learning_rate': 0.03,
    'num_leaves': 2 ** 11 - 1,
    'min_data_in_leaf': 2 ** 12 - 1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'boost_from_average': False,
    'verbosity': -1
    }

In [None]:
if os.path.exists('recursive/no_feat.pkl'):
    print("The file 'no_feat.pkl' already exists. Skipping save operation.")

else:    
    TARGET = 'sales'         # Our main target
    END_TRAIN = 1941         # Last day in train set
    MAIN_INDEX = ['id','d']  # We can identify item by these columns

    eva = pd.read_csv('data/sales_train_evaluation.csv')
    print('Create Grid')
    index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
    grid = pd.melt(eva, 
                    id_vars = index_columns, 
                    var_name = 'd', 
                    value_name = TARGET)

    print(f'Train rows. Wide: {len(eva)}, Deep: {len(grid)}')

    add_grid = pd.DataFrame()
    for i in range(1,29):
        temp_df = eva[index_columns]
        temp_df = temp_df.drop_duplicates()
        temp_df['d'] = 'd_'+ str(END_TRAIN+i)
        temp_df[TARGET] = np.nan
        add_grid = pd.concat([add_grid,temp_df])

    grid = pd.concat([grid,add_grid])
    grid = grid.reset_index(drop=True)

    del temp_df, add_grid, eva
    print("{:>20}: {:>8}".format('Original grid',sizeof_fmt(grid.memory_usage(index=True).sum())))

    for col in index_columns:
        grid[col] = grid[col].astype('category')

    print("{:>20}: {:>8}".format('Reduced grid',sizeof_fmt(grid.memory_usage(index=True).sum())))
    grid = reduce_mem_usage(grid)

    price = pd.read_csv('data/sell_prices.csv')
    calendar = pd.read_csv('data/calendar.csv')
    print('Release week')

    release_df = price.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id','item_id','release']

    grid = merge_by_concat(grid, release_df, ['store_id','item_id'])
    del release_df

    grid = merge_by_concat(grid, calendar[['wm_yr_wk','d']], ['d'])
    grid = grid[grid['wm_yr_wk']>=grid['release']]
    grid = grid.reset_index(drop=True)
    grid = reduce_mem_usage(grid)

    grid = merge_by_concat(grid, price, ['store_id','item_id','wm_yr_wk'])
    grid = reduce_mem_usage(grid)
    print(grid.columns)
    del price, calendar
    grid['release'] = grid['release'] - grid['release'].min()
    grid['release'] = grid['release'].astype(np.int16)

    price = pd.read_pickle('data/prices.pkl')
    grid = grid.merge(price.drop(['sell_price'], axis=1), on = ['store_id','item_id','wm_yr_wk'], how='left')

    calendar = pd.read_csv('data/calendar.csv')
    grid = grid.merge(calendar.drop(['weekday','year','wday','month','wm_yr_wk'], axis=1), on = ['d'], how = 'left')

    le = LabelEncoder()
    cat_vars = ['item_id','store_id','dept_id','cat_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2']
    del price, calendar
    for cat in cat_vars:
        grid[cat] = le.fit_transform(grid[cat])

    grid['date'] = grid['date'].astype('datetime64[ns]')
    grid['tm_d'] = grid['date'].dt.day.astype(np.int8)
    grid['tm_w'] = grid['date'].dt.isocalendar().week.astype(np.int8)
    grid['tm_m'] = grid['date'].dt.month.astype(np.int8)
    grid['tm_y'] = grid['date'].dt.year
    grid['tm_y'] = (grid['tm_y'] - grid['tm_y'].min()).astype(np.int8)
    grid['tm_dw'] = grid['date'].dt.dayofweek.astype(np.int8)
    grid['tm_w_end'] = (grid['tm_dw'] >= 5).astype(np.int8)
    grid['d'] = grid['d'].str.replace('d_', '').astype('int16')
    grid = reduce_mem_usage(grid)
    grid.to_pickle('recursive/no_feat.pkl')
    del grid

In [2]:
def create_lag_features(grid):
    grid = grid[['id','d','sales']].copy()
    grp = grid.groupby(['id'], group_keys=False, observed=False)['sales']
    
    grid = reduce_mem_usage(grid)
    print('************ ROLLING LAGS ************')
    
    for roll in [7, 14, 30, 60]:
        grid.loc[:, f'rm_{roll}'] = grp.transform(lambda x: x.rolling(roll).mean())
        grid.loc[:, f'diff_rm_{roll}'] = grp.transform(lambda x: x.diff().rolling(roll).mean())
        
    for s_window in [1, 7, 14]:
        for roll in [7, 14, 30, 60]:
            grid.loc[:, f'rm_shifted_{roll}'] = grp.transform(lambda x: x.shift(s_window).rolling(roll).mean())
            
    grid = reduce_mem_usage(grid)
    print('************ LAGS ************')
    
    for lag in np.arange(0, 15, 1):
        grid.loc[:, f'lag_{lag}'] = grp.transform(lambda x: x.shift(lag))
        
    grid = grid.drop(['sales'], axis=1)
    ix_to_drop = grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index
    grid.drop(index=ix_to_drop, inplace=True)
    grid = reduce_mem_usage(grid)
    return grid

### Training and saving models

In [None]:
horizon = 28
grid = pd.read_pickle('recursive/grid.pkl')
grid = grid[grid['d']>=1700]

TARGET = ['sales']
DAYS = [14,28]
step = DAYS[0]
STORES = grid.store_id.unique()
DEPTS = grid.dept_id.unique()
LAST_D = 1941

remove_colums = ['id','item_id','dept_id','cat_id','store_id','state_id','d','sales','wm_yr_wk','date']
lags_columns = feats.columns[len(remove_colums):]
train_columns = list(grid.columns[~grid.columns.isin(remove_colums)]) + list(lags_columns) 
print(train_columns)

grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
ix_to_drop = grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index
grid.drop(index=ix_to_drop, inplace=True)

for store in STORES:
    grid = pd.read_pickle('recursive/grid.pkl')
    grid = grid[grid['store_id']==store]
    trainX = grid[grid['d']<=1913][train_columns]
    trainY = grid[grid['d']<=1913][TARGET]
    valX = grid[(grid['d']>1913) & (grid['d']<=LAST_D)][train_columns]
    valY = grid[(grid['d']>1913) & (grid['d']<=LAST_D)][TARGET]
    train_set = lgb.Dataset(trainX, label=trainY)
    val_set = lgb.Dataset(valX, label=valY)

    callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]
    model = lgb.train(params=lgb_params,
                    train_set=train_set,
                    valid_sets=val_set,
                    num_boost_round=1400,
                    callbacks=callbacks)
    model.save_model(f'models/lgbm_{store}.txt')

### Predictions

In [3]:
horizon = 28
grid = pd.read_pickle('recursive/grid.pkl')
grid = grid[grid['d'] >=1750]

TARGET = ['sales']
DAYS = [14,28]
STORES = grid.store_id.unique()
DEPTS = grid.dept_id.unique()
step = DAYS[0]
last_day = 1941

remove_colums = ['id','item_id','dept_id','cat_id','store_id','state_id','d','sales','wm_yr_wk','date']
lags_columns = grid.columns[len(remove_colums):]
train_columns = list(grid.columns[~grid.columns.isin(remove_colums)]) + list(lags_columns) 
print(train_columns)

for day in DAYS:
    if day!=step:
        grid = create_lag_features(grid)

    grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
    ix_to_drop = grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index
    grid.drop(index=ix_to_drop, inplace=True)  
      
    for store in STORES:
        model = lgb.Booster(model_file=f'models/lgbm_{store}.txt')
        pred_msk = ( 
            (grid['d']>last_day+day-step) & 
            (grid['d']<=last_day+day) & 
            (grid['store_id']==store)
        ) 
        testX = grid.loc[pred_msk, train_columns]
        yhat = model.predict(testX).astype('float16')
        tmp_pred = grid.loc[testX.index][['id','d']]
        tmp_pred['sales'] = yhat
        grid.loc[testX.index, 'sales'] = yhat
        predictions = pd.concat([predictions, tmp_pred], axis=0)

predictions.to_pickle(f'submissions/recursive.pkl')
submission = pd.read_csv('data/sample_submission.csv')
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(f'{submission_file}', index=False)
message = f"Recursive step: {step}. Start training day: 1700"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

train columns: ['release', 'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean', 'prev_sell_price', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_dw', 'tm_w_end', 'rolling_zero_7', 'rm_7', 'std_7', 'diff_rm_7', 'rolling_zero_14', 'rm_14', 'std_14', 'diff_rm_14', 'rolling_zero_30', 'rm_30', 'std_30', 'diff_rm_30', 'rolling_zero_60', 'rm_60', 'std_60', 'diff_rm_60', 'lag_0', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10', 'lag_11', 'lag_12', 'lag_13', 'lag_14', 'cat_id_enc', 'item_id_enc', 'dept_id_enc', 'store_id_enc', 'store_id_cat_id_enc', 'store_id_item_id_enc', 'store_id_dept_id_enc']
 Mem. usage decreased to  9.19 Mb (0.0% reduction)
************ ROLLING LAGS ************
 Mem. usage decreased to 81.81 Mb (0.0% reduction)
************ LAGS ************
 Mem. usage decreased to 19.15 Mb (0.0% reduction)
 Mem. usage decreased to 51.90 Mb (0.0%

KeyError: "['rolling_zero_7', 'std_7', 'rolling_zero_14', 'std_14', 'rolling_zero_30', 'std_30', 'rolling_zero_60', 'std_60'] not in index"

In [None]:
#  WORKS
def update_rolling_mean(grid, day, window_size, shift_size):
    # retrieval - the section required to calculate rms
    r_end = day - shift_size # 1)1941 2)1955
    r_start = r_end - shift_size - window_size
    r_mask = (grid['d'] > r_start) & (grid['d'] <= r_end) # 1920-1941
    grp = grid[r_mask].groupby('id', group_keys=False, observed=False)['sales']
    rolling_mean = grp.transform(lambda x: x.rolling(window_size).mean()).astype('float16')

    # values - filtering calculated rms down to the values we are updating
    v_end = day - shift_size
    v_start = v_end - shift_size
    v_mask = (grid['d'] > v_start) & (grid['d'] <= v_end)
    rolling_mean = rolling_mean.loc[v_mask]

    # update - where the values are going to be placed
    u_start = day - shift_size
    u_end = day
    u_mask = (grid['d'] > u_start) & (grid['d'] <= u_end)
    grid.loc[u_mask, f'rm_{window_size}'] = rolling_mean.values
    return grid

def update_rolling_difference(grid, day, window_size, shift_size):
    # retrieval - the section required to calculate rms
    # buffer = 200
    r_end = day - shift_size # 1941
    r_start = r_end - shift_size - window_size #-buffer #Adding an extra d for .diff() computation
    r_mask = (grid['d'] > r_start) & (grid['d'] <= r_end) 
    grp = grid[r_mask].groupby('id', group_keys=False, observed=False)['sales']
    rol_diff = grp.apply(lambda x: x.diff().rolling(window=window_size, 
                                                    min_periods=4).mean()).astype('float16')

    # values - filtering calculated rms down to the values we are updating
    v_end = day - shift_size
    v_start = v_end - shift_size
    v_mask = (grid['d'] > v_start) & (grid['d'] <= v_end)
    rol_diff = rol_diff.loc[v_mask]

    # update - where the values are going to be placed
    u_start = day - shift_size
    u_end = day
    u_mask = (grid['d'] > u_start) & (grid['d'] <= u_end)
    grid.loc[u_mask, f'diff_rm_{window_size}'] = rol_diff.values
    return grid

def update_lags(grid, day, lag_number, shift_size):

    r_end = day - shift_size 
    r_start = r_end - shift_size - lag_number
    r_mask = (grid['d'] > r_start) & (grid['d'] <= r_end) # 1920-1941
    grp = grid[r_mask].groupby('id', group_keys=False, observed=False)['sales']
    lag_values = grp.shift(lag_number)

    v_end = day - shift_size
    v_start = v_end - shift_size
    v_mask = (grid['d'] > v_start) & (grid['d'] <= v_end)
    lag_values = lag_values.loc[v_mask]

    u_start = day - shift_size
    u_end = day
    u_mask = (grid['d'] > u_start) & (grid['d'] <= u_end)
    grid.loc[u_mask, f'lag_{lag_number}'] = lag_values.values
    return grid

def update_rolling_stats(grid, day, window_size, shift_size):

    r_end = day - shift_size # 1)1941 2)1955
    r_start = r_end - shift_size - window_size
    r_mask = (grid['d'] > r_start) & (grid['d'] <= r_end) # 1920-1941
    grp = grid[r_mask].groupby('id', group_keys=False, observed=False)['sales']
    rol_std = grp.transform(lambda x: x.rolling(window_size).std()).astype('float16')
    rol_max = grp.transform(lambda x: x.rolling(window_size).max()).astype('float16')

    # values - filtering calculated rms down to the values we are updating
    v_end = day - shift_size
    v_start = v_end - shift_size
    v_mask = (grid['d'] > v_start) & (grid['d'] <= v_end)
    rol_std = rol_std.loc[v_mask]
    rol_max = rol_max.loc[v_mask]

    u_start = day - shift_size
    u_end = day
    u_mask = (grid['d'] > u_start) & (grid['d'] <= u_end)
    grid.loc[u_mask, f'std_{window_size}'] = rol_std.values
    grid.loc[u_mask, f'max_{window_size}'] = rol_max.values
    return grid

def update_rolling_zeroes(grid, day, window_size, shift_size):

    min_periods = int(window_size/3)
    r_end = day - shift_size # 1)1941 2)1955
    r_start = r_end - shift_size - window_size
    r_mask = (grid['d'] > r_start) & (grid['d'] <= r_end) # 1920-1941

    temp_zero = grid.loc[r_mask][['id','d','sales']]
    temp_zero['is_zero'] = [1 if sales == 0 else 0 for sales in temp_zero['sales']]
    grp = temp_zero.groupby(['id'], group_keys=False, observed=False)['sales']
    rol_zero = grp.transform(lambda x: x.rolling(window=window_size,
                                                 min_periods=min_periods).sum()).astype('float16')
    
    v_end = day - shift_size
    v_start = v_end - shift_size
    v_mask = (grid['d'] > v_start) & (grid['d'] <= v_end)
    rol_zero = rol_zero.loc[v_mask]    
    
    u_start = day - shift_size
    u_end = day
    u_mask = (grid['d'] > u_start) & (grid['d'] <= u_end)
    grid.loc[u_mask, f'rolling_zero_{window_size}'] = rol_zero.values