In [1]:
import pandas as pd
import numpy as np
import subprocess, psutil, os

from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping
from sklearn.preprocessing import LabelEncoder

def get_grid_name(grid):
    """Return the name of the first module-level variable bound to ``grid``.

    The lookup is by identity (``is``), not equality, and walks this module's
    ``globals()`` in insertion order.

    Raises:
        IndexError: if no global name refers to ``grid``.
    """
    matching_names = []
    for candidate, bound_object in globals().items():
        if bound_object is grid:
            matching_names.append(candidate)
    # Indexing (rather than next()) keeps the IndexError on a miss.
    return matching_names[0]

## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the current process's memory usage in GiB, rounded to 2 decimals.

    Reads the first field of ``psutil``'s ``memory_info()`` for this process
    (psutil documents that field as the resident set size) and divides it
    by 2**30.
    """
    this_process = psutil.Process(os.getpid())
    used_bytes = this_process.memory_info()[0]
    bytes_per_gib = 2.**30
    return np.round(used_bytes / bytes_per_gib, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format ``num`` as a human-readable size using binary (1024) prefixes.

    Args:
        num: Size to format (e.g. a byte count); may be negative.
        suffix: Unit suffix appended after the prefix (default ``'B'``).

    Returns:
        A string such as ``'1.5KiB'``. Values of 1024**8 and above are
        reported with the ``Yi`` prefix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    remaining = num
    position = 0
    while position < len(prefixes):
        if abs(remaining) < 1024.0:
            return "%3.1f%s%s" % (remaining, prefixes[position], suffix)
        remaining = remaining / 1024.0
        position += 1
    # Everything up to 'Zi' was too small a unit.
    return "%.1f%s%s" % (remaining, 'Yi', suffix)

def reduce_mem_usage(grid, verbose=True):
    """Downcast numeric columns of ``grid`` to the narrowest dtype covering their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered; every other column is left untouched.

    The range checks are inclusive, so a column whose extremes sit exactly on
    a dtype's limits (e.g. -128..127) is narrowed to that dtype.

    NOTE: the decision is based on the value *range* only. Casting floats to
    float16/float32 can lose precision even when the range fits.

    Args:
        grid: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting size and percentage saved.

    Returns:
        The same ``grid`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = grid.memory_usage().sum() / 1024**2
    for col in grid.columns:
        col_type = grid[col].dtypes
        if col_type not in numerics:
            continue
        c_min = grid[col].min()
        c_max = grid[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    grid[col] = grid[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    grid[col] = grid[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (comparisons false).
                grid[col] = grid[col].astype(np.float64)
    end_mem = grid.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(' Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return grid

def submit_to_kaggle(competition_name, submission_file, message,
                     kaggle_path="/root/miniconda3/envs/lightgbm/bin/kaggle"):
    """Submit a file to a Kaggle competition through the ``kaggle`` CLI.

    Args:
        competition_name: Competition slug passed to ``-c``.
        submission_file: Path of the file to upload, passed to ``-f``.
        message: Submission description, passed to ``-m``.
        kaggle_path: Executable to invoke. Defaults to the environment-specific
            path this notebook was written against; pass another path (or just
            ``"kaggle"``) when running elsewhere.

    Returns:
        The ``subprocess.CompletedProcess`` of the CLI call, so callers can
        inspect ``returncode``. A non-zero exit does not raise.
    """
    # Argument list (no shell), so the message needs no quoting.
    command = [kaggle_path, "competitions", "submit",
               "-c", competition_name,
               "-f", submission_file,
               "-m", message]
    return subprocess.run(command)

## Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    """Left-join ``df2`` onto ``df1`` without letting the merge touch ``df1``'s dtypes.

    Only the key columns of ``df1`` go through ``merge``; the columns that
    ``df2`` contributes are then concatenated next to the original ``df1``.

    Args:
        df1: Left frame; returned columns keep their dtypes.
        df2: Right frame providing the new columns.
        merge_on: List of key column names present in both frames.

    Returns:
        A new DataFrame: ``df1``'s columns followed by ``df2``'s non-key columns.

    Note:
        ``merge`` returns rows labelled 0..n-1 while ``pd.concat(axis=1)``
        aligns on index labels, so the new columns are re-labelled with
        ``df1.index`` whenever the row counts match. If ``df2`` has duplicate
        keys the merge yields more rows than ``df1``; the columns are then
        concatenated as-is (the previous behaviour) and will not line up.
    """
    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')
    new_columns = [col for col in merged_gf.columns if col not in merge_on]
    merged_gf = merged_gf[new_columns]
    if len(merged_gf) == len(df1):
        # A left merge preserves the left row order, so positional relabelling is safe.
        merged_gf.index = df1.index
    return pd.concat([df1, merged_gf], axis=1)

# LightGBM hyper-parameters shared by every model trained in this notebook.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    device_type='cpu',
    subsample_freq=1,
    min_child_weight=1,
    learning_rate=0.03,
    num_leaves=(1 << 11) - 1,
    min_data_in_leaf=(1 << 12) - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,  # upper bound; the training cells use early stopping
    boost_from_average=False,
    verbosity=-1,
)

# Name of the target column.
TARGET = 'sales'
# Last day index available in the training data.
END_TRAIN = 1941
# Columns that together identify one row (item/store series + day).
MAIN_INDEX = ['id', 'd']

In [None]:
# Price & Calendar features
# Builds the long (one row per id and day) grid from the wide sales file,
# appends placeholder rows for the 28 days after END_TRAIN, attaches release
# week, price, calendar and date-part features, and caches the result as
# 'itermediate_dfs/no_feat.pkl' for the later cells.

TARGET = 'sales'         # Our main target
END_TRAIN = 1941         # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

eva = pd.read_csv('data/sales_train_evaluation.csv')
print('Create Grid')
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
grid = pd.melt(eva,
               id_vars = index_columns,
               var_name = 'd',
               value_name = TARGET)

print(f'Train rows. Wide: {len(eva)}, Deep: {len(grid)}')

# Placeholder rows (target = NaN) for days END_TRAIN+1 .. END_TRAIN+28.
# The per-series key frame is computed once and all 28 day-frames are
# concatenated in a single call instead of growing a frame inside the loop.
item_keys = eva[index_columns].drop_duplicates()
add_grid = pd.concat([
    item_keys.assign(**{'d': 'd_' + str(END_TRAIN + i), TARGET: np.nan})
    for i in range(1, 29)
])

grid = pd.concat([grid, add_grid])
grid = grid.reset_index(drop=True)

del item_keys, add_grid, eva
print("{:>20}: {:>8}".format('Original grid',sizeof_fmt(grid.memory_usage(index=True).sum())))

# The identifier columns are repeated strings; store them as categoricals.
for col in index_columns:
    grid[col] = grid[col].astype('category')

print("{:>20}: {:>8}".format('Reduced grid',sizeof_fmt(grid.memory_usage(index=True).sum())))
grid = reduce_mem_usage(grid)

price = pd.read_csv('data/sell_prices.csv')
calendar = pd.read_csv('data/calendar.csv')
print('Release week')

# Release week = first wm_yr_wk with a price record for that store/item pair.
release_df = price.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
release_df.columns = ['store_id','item_id','release']

grid = merge_by_concat(grid, release_df, ['store_id','item_id'])
del release_df

# Map each day to its week, then drop rows that precede the release week.
grid = merge_by_concat(grid, calendar[['wm_yr_wk','d']], ['d'])
grid = grid[grid['wm_yr_wk']>=grid['release']]
grid = grid.reset_index(drop=True)
grid = reduce_mem_usage(grid)

grid = merge_by_concat(grid, price, ['store_id','item_id','wm_yr_wk'])
grid = reduce_mem_usage(grid)
print(grid.columns)
del price, calendar
# Re-base the release week to start at 0 so it can be stored as int16.
grid['release'] = grid['release'] - grid['release'].min()
grid['release'] = grid['release'].astype(np.int16)

# NOTE(review): 'data/prices.pkl' is not produced anywhere in the visible
# notebook; presumably a precomputed table of price features keyed by
# store_id/item_id/wm_yr_wk — confirm where it is built.
price = pd.read_pickle('data/prices.pkl')
grid = grid.merge(price.drop(['sell_price'], axis=1), on = ['store_id','item_id','wm_yr_wk'], how='left')

calendar = pd.read_csv('data/calendar.csv')
grid = grid.merge(calendar.drop(['weekday','year','wday','month','wm_yr_wk'], axis=1), on = ['d'], how = 'left')

le = LabelEncoder()
cat_vars = ['item_id','store_id','dept_id','cat_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2']
del price, calendar
# The same encoder object is refit for each column, so the integer codes are
# independent per column.
for cat in cat_vars:
    grid[cat] = le.fit_transform(grid[cat])

# Date-part features derived from the calendar date.
grid['date'] = grid['date'].astype('datetime64[ns]')
grid['tm_d'] = grid['date'].dt.day.astype(np.int8)
grid['tm_w'] = grid['date'].dt.isocalendar().week.astype(np.int8)
grid['tm_m'] = grid['date'].dt.month.astype(np.int8)
grid['tm_y'] = grid['date'].dt.year
grid['tm_y'] = (grid['tm_y'] - grid['tm_y'].min()).astype(np.int8)
grid['tm_dw'] = grid['date'].dt.dayofweek.astype(np.int8)
grid['tm_w_end'] = (grid['tm_dw'] >= 5).astype(np.int8)
# 'd_123' -> 123 so later cells can compare days numerically.
grid['d'] = grid['d'].str.replace('d_', '').astype('int16')
grid = reduce_mem_usage(grid)
grid.to_pickle('itermediate_dfs/no_feat.pkl')
del grid

In [None]:
# Rolling Features
# Per-series rolling statistics and plain lags of 'sales', cached as
# 'itermediate_dfs/lags.pkl'. The features are computed unshifted here; the
# training cells shift them by the forecast step.
grid = pd.read_pickle('itermediate_dfs/no_feat.pkl')
grid = grid[['id','d','sales']]

# 1 where the day had zero sales; NaN sales compare unequal to 0 and give 0.
grid['is_zero'] = (grid['sales'] == 0).astype(int)

# NOTE(review): both groupby objects are created before the downcast below;
# presumably they keep operating on the pre-downcast columns — keep this
# statement order unless that is verified.
grp = grid.groupby(['id'], group_keys=False, observed=False)['sales']
grp_z = grid.groupby(['id'], group_keys=False, observed=False)['is_zero']

grid = reduce_mem_usage(grid)
print('************ ROLLING LAGS ************')
for roll in [7, 14, 30, 60, 180]:
    grid[f'rolling_zero_{roll}'] = grp_z.transform(lambda x: x.rolling(roll).sum())
    grid[f'rm_{roll}'] = grp.transform(lambda x: x.rolling(roll).mean())
    grid[f'std_{roll}'] = grp.transform(lambda x: x.rolling(roll).std())
    grid[f'diff_rm_{roll}'] = grp.transform(lambda x : x.diff().rolling(roll).mean())
    grid[f'max_{roll}'] = grp.transform(lambda x: x.rolling(roll).max())
    # Downcast after every window to keep peak memory down.
    grid = reduce_mem_usage(grid)

print('************ LAGS ************')
# lag_0 is the unshifted target itself.
for lag in range(0, 15):
    grid[f'lag_{lag}'] = grp.transform(lambda x: x.shift(lag))

grid = grid.drop(['is_zero', 'sales'], axis = 1)
# Drop incomplete rows only inside the training range; rows after END_TRAIN
# are kept even when they contain NaN.
ix_to_drop = grid[(grid['d'] <= END_TRAIN) & (grid.isna().any(axis=1))].index
grid = grid.drop(index=ix_to_drop)
grid = reduce_mem_usage(grid)
grid.to_pickle('itermediate_dfs/lags.pkl')
del grid

In [None]:
# Mean Categorical encoding
# For each grouping below, add '<cols joined by _>_enc': the expanding
# (running) mean of 'sales' within the group, in row order. Cached as
# 'itermediate_dfs/enc_feats.pkl'.

grid = pd.read_pickle('itermediate_dfs/no_feat.pkl')
grid = grid[['id','d','sales','item_id','dept_id','cat_id','store_id','state_id']]
grid = reduce_mem_usage(grid)

encoding_groups = [
    ['cat_id'],
    ['item_id'],
    ['dept_id'],
    ['store_id'],
    ['store_id', 'cat_id'],
    ['store_id', 'item_id'],
    ['store_id', 'dept_id'],
]
for group_cols in encoding_groups:
    enc_name = '_'.join(group_cols) + '_enc'
    grouped_sales = grid.groupby(group_cols, observed=False)['sales']
    grid[enc_name] = grouped_sales.transform(lambda x: x.expanding().mean())


print('************ CATEGORIES ENCODED ************')
# Memory reduction: keep only the keys ('id', 'd') and the encoded columns.
grid = grid.drop(columns=['sales','item_id','dept_id','cat_id','store_id','state_id'])
grid = reduce_mem_usage(grid)
grid.to_pickle('itermediate_dfs/enc_feats.pkl')
del grid

In [None]:
# Categories Rolling means
# Rolling means of the daily mean sales per item (averaged over the rows of
# that item on that day), merged back onto the (id, d) grid and cached as
# 'itermediate_dfs/dimension_feats.pkl'.

grid = pd.read_pickle('itermediate_dfs/no_feat.pkl')
grid = grid[['id','d','store_id','item_id','dept_id','sales','cat_id']]

# For item-level features
item_feats = grid.groupby(['d', 'item_id'])['sales'].mean().reset_index()
grp = item_feats.groupby(['item_id'], observed=False)['sales']
for roll in (7, 30, 60):
    rolling_name = f'item_rm_{roll}'
    item_feats[rolling_name] = grp.transform(lambda x: x.rolling(roll).mean())
# Keep only the (d, item_id) pairs for which every window is available.
item_feats = item_feats.drop(columns=['sales']).dropna().reset_index(drop=True)
print('Items Completed')

grid = grid.merge(item_feats, on=['item_id','d'], how='left')
grid = grid.drop(columns=['store_id','dept_id','item_id','cat_id','sales'])
# Rows without a complete set of rolling means are discarded.
grid = grid.dropna()
grid = reduce_mem_usage(grid)
grid.to_pickle('itermediate_dfs/dimension_feats.pkl')

# NOTE(review): the bare string below holds disabled dept/cat/store code. It is
# never executed; as written it would fail if re-enabled here, because it
# groups `grid` by columns ('sales', 'dept_id', ...) that were dropped above.
""" 
dept_feats = grid.groupby(['d', 'dept_id'])['sales'].mean().reset_index()
grp = dept_feats.groupby(['dept_id'], observed=False)['sales']
for roll in [7,30,60]:
    dept_feats[f'dept_rm_{roll}'] = grp.transform(lambda x: x.rolling(roll).mean())
dept_feats = dept_feats.drop(['sales'], axis=1).dropna().reset_index(drop=True)
print(f'Depts Completed')

# For item-level features
cat_feats = grid.groupby(['d', 'cat_id'])['sales'].mean().reset_index()
grp = cat_feats.groupby(['cat_id'], observed=False)['sales']
for roll in [7, 30, 60]:
    cat_feats[f'cat_rm_{roll}'] = grp.transform(lambda x: x.rolling(roll).mean())
cat_feats = cat_feats.drop(['sales'], axis=1).dropna().reset_index(drop=True)
print(f'Categories Completed')

# For store-level features
store_feats = grid.groupby(['d', 'store_id'])['sales'].mean().reset_index()
grp = store_feats.groupby(['store_id'], observed=False)['sales']
for roll in [7, 30, 60]:
    store_feats[f'store_rm_{roll}'] = grp.transform(lambda x: x.rolling(roll).mean())
store_feats = store_feats.drop(['sales'], axis=1).dropna().reset_index(drop=True)
print(f'Stores Completed')

grid = reduce_mem_usage(grid)
grid = grid.merge(dept_feats, on=['dept_id','d'], how='left')
grid = grid.merge(store_feats, on=['store_id','d'], how='left')
grid = grid.merge(cat_feats, on=['cat_id','d'], how='left')
 """

In [None]:
# On promotion features
# Derives three boolean promotion flags per store/item/week from the price
# table and stores them, keyed by id and wm_yr_wk, in 'itermediate_dfs/promo.pkl'.

# NOTE(review): 'itermediate_dfs/bare_cal_price.pkl' and 'data/prices.pkl' are
# not created anywhere in the visible notebook — confirm which step builds them.
bare = pd.read_pickle('itermediate_dfs/bare_cal_price.pkl')
# Reduce to one row per remaining column combination (used to look up 'id').
bare = bare.drop(['sales','dept_id','state_id', 'cat_id', 'release', 'd'], axis=1).drop_duplicates()
price = pd.read_pickle('data/prices.pkl')
# Price more than two standard deviations below the mean price.
price['on_promotion'] = price['sell_price'] < (price['price_mean']-price['price_std']*2)
# Price more than 10% below the mean price.
price['ten_prc_promo'] = price['sell_price'] < 0.9 * price['price_mean']
# Price more than 10% below the previous price.
price['prev_wk_promo'] = price['sell_price'] < 0.9 * price['prev_sell_price']
price = price.drop(['sell_price','price_max','price_min','price_std','price_mean','prev_sell_price'], axis=1)
price = price.merge(bare, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
price = price.drop(['store_id','item_id'], axis=1)
new_order = ['id', 'wm_yr_wk', 'on_promotion', 'ten_prc_promo', 'prev_wk_promo']
price = price[new_order]
# Written once; the original wrote the identical frame twice.
price.to_pickle('itermediate_dfs/promo.pkl')
print(f'**** On-promotion features added ****')
del price, bare

In [2]:
# Training configuration shared by the model cells below.
horizon = 28  # number of days to forecast after END_TRAIN
base = pd.read_pickle('itermediate_dfs/no_feat.pkl')

# Forecast steps: one model is trained per step and predicts a
# VAL_DAYS-wide slice of the horizon ending at END_TRAIN + step.
STEPS = list(range(2, 30, 2))
# NOTE: from here on TARGET is a one-element list (it was the string 'sales'
# in the feature cells), so selecting with it yields a DataFrame.
TARGET = ['sales']
VAL_DAYS, TEST_DAYS = STEPS[0], STEPS[0]
STORES = base.store_id.unique()
DEPTS = base.dept_id.unique()
CATS = base.cat_id.unique()

# Day windows, derived from END_TRAIN / horizon instead of repeated literals
# (train_end = 1913, val_start = 1914, val_end = 1941, first_pred_day = 1942).
train_start = 1000
train_end = END_TRAIN - horizon
val_start = train_end + 1
val_end = END_TRAIN
first_pred_day = END_TRAIN + 1

feats = pd.read_pickle('itermediate_dfs/lags.pkl')
enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')

# Columns never fed to the model (identifiers, target, raw date keys).
remove_colums = ['id','item_id','dept_id','cat_id','store_id','state_id','d','sales','wm_yr_wk','date']
# Both feature files start with the key columns 'id' and 'd'.
enc_columns = enc_feats.columns[2:]
# Columns the training cells shift by the forecast step.
lags_columns = list(feats.columns[2:]) + list(enc_columns)
train_columns = list(base.columns[~base.columns.isin(remove_colums)]) + list(lags_columns)
# Only the column names are needed past this point; free all three frames.
del base, feats, enc_feats

In [3]:
# One LightGBM model per (store, forecast step); the per-step predictions are
# stitched into a single submission.
prediction_parts = []
for store in STORES:
    print(f'************ Training Store {store+1} ************')

    # Loading and merging depend only on the store, so do them once per store
    # rather than once per step. This keeps one extra store-sized frame in
    # memory in exchange for far fewer full pickle reads.
    base = pd.read_pickle('itermediate_dfs/no_feat.pkl')
    feats = pd.read_pickle('itermediate_dfs/lags.pkl')
    enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')
    base = base[(base['store_id']==store)]
    store_ids = base.id.unique()
    feats = feats[feats['id'].isin(store_ids)]
    enc_feats = enc_feats[enc_feats['id'].isin(store_ids)]
    store_grid = base.merge(feats, on=['id', 'd'], how='left')
    store_grid = store_grid.merge(enc_feats, on=['id', 'd'], how='left')
    del enc_feats, feats, base

    for step in STEPS:
        # Fresh copy per step: the shift and row drop below mutate the frame.
        grid = store_grid.copy()

        # Features for day t are taken from day t - step of the same series.
        grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
        grid.drop(index=grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index, inplace=True)

        pred_start = val_end + step - VAL_DAYS + 1
        pred_end = val_end + step
        print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')
        train_rows = grid['d'].between(train_start, train_end)
        val_rows = grid['d'].between(val_start, val_end)
        pred_rows = grid['d'].between(pred_start, pred_end)
        trainX = grid.loc[train_rows, train_columns]
        trainY = grid.loc[train_rows, TARGET]
        valX = grid.loc[val_rows, train_columns]
        valY = grid.loc[val_rows, TARGET]
        testX = grid.loc[pred_rows, train_columns]

        # Train
        lgbm = LGBMRegressor(**lgb_params)
        callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]

        lgbm.fit(trainX, trainY,
                 eval_set=[(valX, valY)],
                 eval_metric='rmse',
                 callbacks=callbacks)

        # Predict
        yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
        # .assign builds a new frame instead of writing into a slice of grid.
        preds = grid.loc[pred_rows, ['id', 'd']].assign(sales=yhat)
        prediction_parts.append(preds)

# Single concat instead of growing a DataFrame inside the loop.
predictions = pd.concat(prediction_parts, axis=0)

# Submission
predictions.to_pickle(f'submissions/no_dim_{VAL_DAYS}_{train_start}_stores.pkl')
submission = pd.read_csv('data/sample_submission.csv')
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
# Ids without a forecast are filled with 1.
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(submission_file, index=False)
# The step in the message follows VAL_DAYS (it was hard-coded as "Step14").
message = f"Stores only. Step{VAL_DAYS}. All data"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

************ Training Store 1 ************
Val start: 1914. Val end 1941. Test start 1942 Pred end 1943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[387]	valid_0's rmse: 2.01758
Val start: 1914. Val end 1941. Test start 1944 Pred end 1945
Training until validation scores don't improve for 50 rounds


In [None]:
# One LightGBM model per (store, department, forecast step); the per-step
# predictions are stitched into a single submission.
prediction_parts = []
for store in STORES:
    print(f'************ Training Store {store+1} ************')

    # The three pickles are read and merged once per store; departments are
    # then sliced from the merged frame instead of re-reading everything for
    # every (dept, step) pair.
    base = pd.read_pickle('itermediate_dfs/no_feat.pkl')
    feats = pd.read_pickle('itermediate_dfs/lags.pkl')
    enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')
    base = base[(base['store_id']==store)]
    store_ids = base.id.unique()
    feats = feats[feats['id'].isin(store_ids)]
    enc_feats = enc_feats[enc_feats['id'].isin(store_ids)]
    store_grid = base.merge(feats, on=['id', 'd'], how='left')
    store_grid = store_grid.merge(enc_feats, on=['id', 'd'], how='left')
    del enc_feats, feats, base

    for dept in DEPTS:
        # Same rows, order and 0..n-1 index as merging a dept-filtered base.
        dept_grid = store_grid[store_grid['dept_id'] == dept].reset_index(drop=True)

        for step in STEPS:
            # Fresh copy per step: the shift and row drop below mutate the frame.
            grid = dept_grid.copy()

            # Features for day t are taken from day t - step of the same series.
            grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            grid.drop(index=grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index, inplace=True)

            pred_start = val_end + step - VAL_DAYS + 1
            pred_end = val_end + step
            print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')
            train_rows = grid['d'].between(train_start, train_end)
            val_rows = grid['d'].between(val_start, val_end)
            pred_rows = grid['d'].between(pred_start, pred_end)
            trainX = grid.loc[train_rows, train_columns]
            trainY = grid.loc[train_rows, TARGET]
            valX = grid.loc[val_rows, train_columns]
            valY = grid.loc[val_rows, TARGET]
            testX = grid.loc[pred_rows, train_columns]

            # Train
            lgbm = LGBMRegressor(**lgb_params)
            callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]

            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            # .assign builds a new frame instead of writing into a slice of grid.
            preds = grid.loc[pred_rows, ['id', 'd']].assign(sales=yhat)
            prediction_parts.append(preds)

# Single concat instead of growing a DataFrame inside the loop.
predictions = pd.concat(prediction_parts, axis=0)

# Submission
predictions.to_pickle(f'submissions/no_dim_{VAL_DAYS}_{train_start}_depts.pkl')
submission = pd.read_csv('data/sample_submission.csv')
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
# Ids without a forecast are filled with 1.
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(submission_file, index=False)
message = f"Depts only. Step{VAL_DAYS}. All data"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

************ Training Store 1 ************
Val start: 1914. Val end 1941. Test start 1942 Pred end 1943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[169]	valid_0's rmse: 2.47246
Val start: 1914. Val end 1941. Test start 1944 Pred end 1945
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[227]	valid_0's rmse: 2.47259
Val start: 1914. Val end 1941. Test start 1946 Pred end 1947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[263]	valid_0's rmse: 2.47096
Val start: 1914. Val end 1941. Test start 1948 Pred end 1949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[230]	valid_0's rmse: 2.47657
Val start: 1914. Val end 1941. Test start 1950 Pred end 1951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[312]	valid_0's rmse: 2.4789
Val start: 1914. Val en

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.51MB/s]


Successfully submitted to M5 Forecasting - Accuracy

In [None]:
# One LightGBM model per (store, category, forecast step); the per-step
# predictions are stitched into a single submission.
prediction_parts = []
for store in STORES:
    print(f'************ Training Store {store+1} ************')

    # The three pickles are read and merged once per store; categories are
    # then sliced from the merged frame instead of re-reading everything for
    # every (cat, step) pair.
    base = pd.read_pickle('itermediate_dfs/no_feat.pkl')
    feats = pd.read_pickle('itermediate_dfs/lags.pkl')
    enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')
    base = base[(base['store_id']==store)]
    store_ids = base.id.unique()
    # Pre-filter the lag features as the store and dept cells do. The left
    # merge gives the same rows either way; this only keeps the merge small.
    feats = feats[feats['id'].isin(store_ids)]
    enc_feats = enc_feats[enc_feats['id'].isin(store_ids)]
    store_grid = base.merge(feats, on=['id', 'd'], how='left')
    store_grid = store_grid.merge(enc_feats, on=['id', 'd'], how='left')
    del enc_feats, feats, base

    for cat in CATS:
        # Same rows, order and 0..n-1 index as merging a cat-filtered base.
        cat_grid = store_grid[store_grid['cat_id'] == cat].reset_index(drop=True)

        for step in STEPS:
            # Fresh copy per step: the shift and row drop below mutate the frame.
            grid = cat_grid.copy()

            # Features for day t are taken from day t - step of the same series.
            grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            grid.drop(index=grid[(grid['d'] <= 1941) & (grid.isna().any(axis=1))].index, inplace=True)

            pred_start = val_end + step - VAL_DAYS + 1
            pred_end = val_end + step
            print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')
            train_rows = grid['d'].between(train_start, train_end)
            val_rows = grid['d'].between(val_start, val_end)
            pred_rows = grid['d'].between(pred_start, pred_end)
            trainX = grid.loc[train_rows, train_columns]
            trainY = grid.loc[train_rows, TARGET]
            valX = grid.loc[val_rows, train_columns]
            valY = grid.loc[val_rows, TARGET]
            testX = grid.loc[pred_rows, train_columns]

            # Train
            lgbm = LGBMRegressor(**lgb_params)
            callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]

            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            # .assign builds a new frame instead of writing into a slice of grid.
            preds = grid.loc[pred_rows, ['id', 'd']].assign(sales=yhat)
            prediction_parts.append(preds)

# Single concat instead of growing a DataFrame inside the loop.
predictions = pd.concat(prediction_parts, axis=0)

# Submission
predictions.to_pickle(f'submissions/no_dim_{VAL_DAYS}_{train_start}_cats.pkl')
submission = pd.read_csv('data/sample_submission.csv')
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
# Ids without a forecast are filled with 1.
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(submission_file, index=False)
# This cell trains per store and category; the message was copy-pasted from
# the stores cell ("Stores only. Step14").
message = f"Cats only. Step{VAL_DAYS}. All data"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

************ Training Store 1 ************
Val start: 1914. Val end 1941. Test start 1942 Pred end 1943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[282]	valid_0's rmse: 2.15713
Val start: 1914. Val end 1941. Test start 1944 Pred end 1945
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[218]	valid_0's rmse: 2.15882
Val start: 1914. Val end 1941. Test start 1946 Pred end 1947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[232]	valid_0's rmse: 2.16178
Val start: 1914. Val end 1941. Test start 1948 Pred end 1949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[254]	valid_0's rmse: 2.16168
Val start: 1914. Val end 1941. Test start 1950 Pred end 1951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[287]	valid_0's rmse: 2.16507
Val start: 1914. Val e

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.69MB/s]


Successfully submitted to M5 Forecasting - Accuracy

Experiments

1. Promotions are basically discounts. Can we add a promotion flag? Can we create promotion features?
2. Check when prices rise or dip in anticipation of an event. Try price*holiday_flag.
3. Same as 2 but with promotion flag. Price*promotion_flag
4. Store-, dept- and category-level rolling features and lags.

In [None]:
# Blend the dept-, cat- and store-level forecasts with an unweighted mean and
# submit the result.
# NOTE(review): these file names are hard-coded as 'no_dim_2_1_*', while the
# training cells write 'no_dim_{VAL_DAYS}_{train_start}_*'. With
# train_start = 1000 those are different files — confirm which runs should be
# blended.
member_files = (
    'submissions/no_dim_2_1_depts.pkl',
    'submissions/no_dim_2_1_cats.pkl',
    'submissions/no_dim_2_1_stores.pkl',
)
# Index every member by (id, d) so the addition aligns row by row.
preds1, preds2, preds3 = (
    pd.read_pickle(member_path).set_index(['id', 'd'])
    for member_path in member_files
)

df_avg = ((preds1 + preds2 + preds3) / 3).reset_index()

submission = pd.read_csv('data/sample_submission.csv')
wide_avg = df_avg.pivot(index='id', columns='d', values='sales').reset_index()
wide_avg.columns = submission.columns
# Ids missing from the blend are filled with 1.
df_avg = submission[['id']].merge(wide_avg, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
df_avg.to_csv(submission_file, index=False)
message = "Step 2. Stores, Depts and Cats"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.35MB/s]


Successfully submitted to M5 Forecasting - Accuracy