In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping
  

# LightGBM hyper-parameters shared by every model fitted in this notebook.
# Keys are passed verbatim to LGBMRegressor(**lgb_params).
lgb_params = dict(
    # --- objective and evaluation ---
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    boost_from_average=False,
    # --- row / feature sampling ---
    subsample=0.5,          # LightGBM alias of bagging_fraction
    subsample_freq=1,       # LightGBM alias of bagging_freq
    feature_fraction=0.5,
    # --- tree shape ---
    num_leaves=(1 << 11) - 1,        # 2047
    min_data_in_leaf=(1 << 12) - 1,  # 4095
    min_child_weight=1,
    max_bin=100,
    # --- boosting schedule ---
    learning_rate=0.03,
    n_estimators=1400,      # upper bound; the training cells use early stopping
    # --- runtime ---
    device_type='cpu',
    verbosity=-1,
)

In [2]:
# Notebook configuration: forecast horizon, segment lists, train/validation
# day boundaries and the feature-column lists used by every training cell.
horizon = 28
base = pd.read_pickle('itermediate_dfs/no_feat.pkl')

# Forecast offsets, one model per offset: 2, 4, ..., horizon.
STEPS = list(np.arange(2, horizon + 2, 2))
TARGET = ['sales']
# Width (in days) of the block predicted by each model.
VAL_DAYS, TEST_DAYS = STEPS[0], STEPS[0]
STORES = base.store_id.unique()
DEPTS = base.dept_id.unique()
CATS = base.cat_id.unique()

# Day boundaries (values of column `d`). The last `horizon` days up to
# `val_end` are held out for early stopping; everything before is training.
train_start = 1
val_end = 1941
train_end = val_end - horizon      # 1913
val_start = train_end + 1          # 1914
first_pred_day = val_end + 1       # 1942

feats = pd.read_pickle('itermediate_dfs/lags.pkl')
enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')

# Columns of `base` that must not be used as model inputs.
remove_colums = ['id','item_id','dept_id','cat_id','store_id','state_id','d','sales','wm_yr_wk','date']
# Skips the first two columns of each feature table
# (assumed to be the `id` / `d` join keys — TODO confirm).
enc_columns = enc_feats.columns[2:]
lags_columns = list(feats.columns[2:]) + list(enc_columns)
train_columns = list(base.columns[~base.columns.isin(remove_colums)]) + list(lags_columns)

# Only the column names are needed from here on. Release all three frames
# (including enc_feats) so the training cells do not hold a second copy
# while they reload the pickles.
del base, feats, enc_feats

In [3]:
# Per-store models: one LightGBM regressor per (store, forecast step).
# Each model sees features shifted by `step` days, uses days
# val_start..val_end for early stopping, and predicts the VAL_DAYS-wide
# block of days ending at val_end + step.
#
# NOTE(review): `submit_to_kaggle` is not defined or imported in any cell
# visible here — confirm it is available before a Restart & Run All.
step_predictions = []  # one frame per (store, step); concatenated once at the end
for store in STORES:
    print(f'************ Training Store {store+1} ************')

    # Load and merge once per store: the merged grid does not depend on
    # `step`. The full tables are released right after the merge so that
    # only the store-level grid stays in memory during training.
    base = pd.read_pickle('itermediate_dfs/no_feat.pkl')
    feats = pd.read_pickle('itermediate_dfs/lags.pkl')
    enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')
    base = base[(base['store_id']==store)]
    segment_ids = base.id.unique()
    feats = feats[feats['id'].isin(segment_ids)]
    enc_feats = enc_feats[enc_feats['id'].isin(segment_ids)]
    segment_grid = base.merge(feats, on=['id', 'd'], how='left')
    segment_grid = segment_grid.merge(enc_feats, on=['id', 'd'], how='left')
    del enc_feats, feats, base

    for step in STEPS:
        # Shift every lag/encoding feature by `step` rows within each id,
        # always starting from the unshifted per-store grid.
        grid = segment_grid.copy()
        grid[lags_columns] = segment_grid.groupby(['id'], observed=False)[lags_columns].shift(step)
        # Drop historical rows (d <= val_end) that contain any NaN; rows
        # after val_end are kept because they are the ones to predict.
        incomplete = (grid['d'] <= val_end) & grid.isna().any(axis=1)
        grid = grid.loc[~incomplete]

        pred_start = val_end + step - VAL_DAYS + 1
        pred_end = val_end + step
        print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')

        # Inclusive day windows, computed once and reused for X and y.
        in_train = grid['d'].between(train_start, train_end)
        in_val = grid['d'].between(val_start, val_end)
        in_test = grid['d'].between(pred_start, pred_end)
        trainX, trainY = grid.loc[in_train, train_columns], grid.loc[in_train, TARGET]
        valX, valY = grid.loc[in_val, train_columns], grid.loc[in_val, TARGET]
        testX = grid.loc[in_test, train_columns]

        # Train
        lgbm = LGBMRegressor(**lgb_params)
        callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]
        lgbm.fit(trainX, trainY,
                 eval_set=[(valX, valY)],
                 eval_metric='rmse',
                 callbacks=callbacks)

        # Predict
        yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
        # .assign builds a new frame, avoiding assignment onto a slice of `grid`.
        step_predictions.append(grid.loc[in_test, ['id', 'd']].assign(sales=yhat))

    del segment_grid

predictions = pd.concat(step_predictions, axis=0)

# Submission
predictions.to_pickle(f'submissions/no_dim_{VAL_DAYS}_{train_start}_stores.pkl')
submission = pd.read_csv('data/sample_submission.csv')
# Long (id, d, sales) -> wide (one row per id, one column per day), then
# take the column names of the sample submission positionally.
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
# Ids in the sample submission that have no prediction are filled with 1.
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(submission_file, index=False)
# The step size is taken from VAL_DAYS so the message matches the actual run.
message = f"Stores only. Step{VAL_DAYS}. All data"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

************ Training Store 1 ************
Val start: 1914. Val end 1941. Test start 1942 Pred end 1943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[387]	valid_0's rmse: 2.01758
Val start: 1914. Val end 1941. Test start 1944 Pred end 1945
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[274]	valid_0's rmse: 2.05222
Val start: 1914. Val end 1941. Test start 1946 Pred end 1947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[269]	valid_0's rmse: 2.07466
Val start: 1914. Val end 1941. Test start 1948 Pred end 1949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[255]	valid_0's rmse: 2.10049
Val start: 1914. Val end 1941. Test start 1950 Pred end 1951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[252]	valid_0's rmse: 2.12445
Val start: 1914. Val e

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.48MB/s]


Successfully submitted to M5 Forecasting - Accuracy

In [4]:
# Per-(store, department) models: one LightGBM regressor per
# (store, dept, forecast step). Each model sees features shifted by `step`
# days, uses days val_start..val_end for early stopping, and predicts the
# VAL_DAYS-wide block of days ending at val_end + step.
#
# NOTE(review): `submit_to_kaggle` is not defined or imported in any cell
# visible here — confirm it is available before a Restart & Run All.
step_predictions = []  # one frame per (store, dept, step); concatenated once at the end
for store in STORES:
    print(f'************ Training Store {store+1} ************')
    for dept in DEPTS:
        # Load and merge once per (store, dept): the merged grid does not
        # depend on `step`. The full tables are released right after the
        # merge so only the segment-level grid stays in memory.
        base = pd.read_pickle('itermediate_dfs/no_feat.pkl')
        feats = pd.read_pickle('itermediate_dfs/lags.pkl')
        enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')
        base = base[(base['store_id']==store) & (base['dept_id']==dept)]
        segment_ids = base.id.unique()
        feats = feats[feats['id'].isin(segment_ids)]
        enc_feats = enc_feats[enc_feats['id'].isin(segment_ids)]
        segment_grid = base.merge(feats, on=['id', 'd'], how='left')
        segment_grid = segment_grid.merge(enc_feats, on=['id', 'd'], how='left')
        del enc_feats, feats, base

        for step in STEPS:
            # Shift every lag/encoding feature by `step` rows within each
            # id, always starting from the unshifted segment grid.
            grid = segment_grid.copy()
            grid[lags_columns] = segment_grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            # Drop historical rows (d <= val_end) that contain any NaN; rows
            # after val_end are kept because they are the ones to predict.
            incomplete = (grid['d'] <= val_end) & grid.isna().any(axis=1)
            grid = grid.loc[~incomplete]

            pred_start = val_end + step - VAL_DAYS + 1
            pred_end = val_end + step
            print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')

            # Inclusive day windows, computed once and reused for X and y.
            in_train = grid['d'].between(train_start, train_end)
            in_val = grid['d'].between(val_start, val_end)
            in_test = grid['d'].between(pred_start, pred_end)
            trainX, trainY = grid.loc[in_train, train_columns], grid.loc[in_train, TARGET]
            valX, valY = grid.loc[in_val, train_columns], grid.loc[in_val, TARGET]
            testX = grid.loc[in_test, train_columns]

            # Train
            lgbm = LGBMRegressor(**lgb_params)
            callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]
            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            # .assign builds a new frame, avoiding assignment onto a slice of `grid`.
            step_predictions.append(grid.loc[in_test, ['id', 'd']].assign(sales=yhat))

        del segment_grid

predictions = pd.concat(step_predictions, axis=0)

# Submission
predictions.to_pickle(f'submissions/no_dim_{VAL_DAYS}_{train_start}_depts.pkl')
submission = pd.read_csv('data/sample_submission.csv')
# Long (id, d, sales) -> wide (one row per id, one column per day), then
# take the column names of the sample submission positionally.
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
# Ids in the sample submission that have no prediction are filled with 1.
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(submission_file, index=False)
# The step size is taken from VAL_DAYS so the message always matches the run.
message = f"Depts only. Step{VAL_DAYS}. All data"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

************ Training Store 1 ************
Val start: 1914. Val end 1941. Test start 1942 Pred end 1943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[476]	valid_0's rmse: 2.47484
Val start: 1914. Val end 1941. Test start 1944 Pred end 1945
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[287]	valid_0's rmse: 2.47486
Val start: 1914. Val end 1941. Test start 1946 Pred end 1947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[280]	valid_0's rmse: 2.47919
Val start: 1914. Val end 1941. Test start 1948 Pred end 1949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[289]	valid_0's rmse: 2.47645
Val start: 1914. Val end 1941. Test start 1950 Pred end 1951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[268]	valid_0's rmse: 2.48452
Val start: 1914. Val e

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.59MB/s]


Successfully submitted to M5 Forecasting - Accuracy

In [5]:
# Per-(store, category) models: one LightGBM regressor per
# (store, cat, forecast step). Each model sees features shifted by `step`
# days, uses days val_start..val_end for early stopping, and predicts the
# VAL_DAYS-wide block of days ending at val_end + step.
#
# NOTE(review): `submit_to_kaggle` is not defined or imported in any cell
# visible here — confirm it is available before a Restart & Run All.
step_predictions = []  # one frame per (store, cat, step); concatenated once at the end
for store in STORES:
    print(f'************ Training Store {store+1} ************')
    for cat in CATS:
        # Load and merge once per (store, cat): the merged grid does not
        # depend on `step`. The full tables are released right after the
        # merge so only the segment-level grid stays in memory.
        base = pd.read_pickle('itermediate_dfs/no_feat.pkl')
        feats = pd.read_pickle('itermediate_dfs/lags.pkl')
        enc_feats = pd.read_pickle('itermediate_dfs/enc_feats.pkl')
        base = base[(base['store_id']==store) & (base['cat_id']==cat)]
        segment_ids = base.id.unique()
        # Pre-filter the lag table as well, as the store and dept cells do.
        # The left merge result is unchanged; the merge just handles fewer rows.
        feats = feats[feats['id'].isin(segment_ids)]
        enc_feats = enc_feats[enc_feats['id'].isin(segment_ids)]
        segment_grid = base.merge(feats, on=['id', 'd'], how='left')
        segment_grid = segment_grid.merge(enc_feats, on=['id', 'd'], how='left')
        del enc_feats, feats, base

        for step in STEPS:
            # Shift every lag/encoding feature by `step` rows within each
            # id, always starting from the unshifted segment grid.
            grid = segment_grid.copy()
            grid[lags_columns] = segment_grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            # Drop historical rows (d <= val_end) that contain any NaN; rows
            # after val_end are kept because they are the ones to predict.
            incomplete = (grid['d'] <= val_end) & grid.isna().any(axis=1)
            grid = grid.loc[~incomplete]

            pred_start = val_end + step - VAL_DAYS + 1
            pred_end = val_end + step
            print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')

            # Inclusive day windows, computed once and reused for X and y.
            in_train = grid['d'].between(train_start, train_end)
            in_val = grid['d'].between(val_start, val_end)
            in_test = grid['d'].between(pred_start, pred_end)
            trainX, trainY = grid.loc[in_train, train_columns], grid.loc[in_train, TARGET]
            valX, valY = grid.loc[in_val, train_columns], grid.loc[in_val, TARGET]
            testX = grid.loc[in_test, train_columns]

            # Train
            lgbm = LGBMRegressor(**lgb_params)
            callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]
            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            # .assign builds a new frame, avoiding assignment onto a slice of `grid`.
            step_predictions.append(grid.loc[in_test, ['id', 'd']].assign(sales=yhat))

        del segment_grid

predictions = pd.concat(step_predictions, axis=0)

# Submission
predictions.to_pickle(f'submissions/no_dim_{VAL_DAYS}_{train_start}_cats.pkl')
submission = pd.read_csv('data/sample_submission.csv')
# Long (id, d, sales) -> wide (one row per id, one column per day), then
# take the column names of the sample submission positionally.
predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
predictions.columns = submission.columns
# Ids in the sample submission that have no prediction are filled with 1.
predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
submission_file = "submissions/submission.csv"
predictions.to_csv(submission_file, index=False)
# This cell trains per category; the previous text ("Stores only. Step14")
# was carried over from the store cell and mislabelled the submission.
message = f"Cats only. Step{VAL_DAYS}. All data"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

************ Training Store 1 ************
Val start: 1914. Val end 1941. Test start 1942 Pred end 1943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[421]	valid_0's rmse: 2.15736
Val start: 1914. Val end 1941. Test start 1944 Pred end 1945
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[435]	valid_0's rmse: 2.1586
Val start: 1914. Val end 1941. Test start 1946 Pred end 1947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[231]	valid_0's rmse: 2.16618
Val start: 1914. Val end 1941. Test start 1948 Pred end 1949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[265]	valid_0's rmse: 2.16293
Val start: 1914. Val end 1941. Test start 1950 Pred end 1951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[330]	valid_0's rmse: 2.16546
Val start: 1914. Val en

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.49MB/s]


Successfully submitted to M5 Forecasting - Accuracy

Ensemble

In [7]:
# Equal-weight ensemble of the three saved forecasts (depts, cats, stores).
# Each frame is indexed by (id, d) so the addition aligns row-by-row on
# that key rather than on position.
forecast_key = ['id', 'd']
preds1 = pd.read_pickle('submissions/no_dim_2_1_depts.pkl').set_index(forecast_key)
preds2 = pd.read_pickle('submissions/no_dim_2_1_cats.pkl').set_index(forecast_key)
preds3 = pd.read_pickle('submissions/no_dim_2_1_stores.pkl').set_index(forecast_key)

# A key missing from any member yields NaN here and is later replaced by
# the fillna(1) below.
df_avg = ((preds1 + preds2 + preds3) / 3).reset_index()

# Reshape to the submission layout: one row per id, one column per day,
# with column names taken positionally from the sample submission.
submission = pd.read_csv('data/sample_submission.csv')
df_avg = df_avg.pivot(index='id', columns='d', values='sales').reset_index()
df_avg.columns = submission.columns
df_avg = submission[['id']].merge(df_avg, on='id', how='left').fillna(1)

submission_file = "submissions/submission.csv"
df_avg.to_csv(submission_file, index=False)

# NOTE(review): `submit_to_kaggle` is not defined or imported in any cell
# visible here — confirm it is available before a Restart & Run All.
message = "Step 2. Stores, Depts and Cats"
competition_name = "m5-forecasting-accuracy"
submit_to_kaggle(competition_name, submission_file, message)

100%|██████████| 20.5M/20.5M [00:04<00:00, 4.68MB/s]


Successfully submitted to M5 Forecasting - Accuracy

1. Promotions are basically discounts. Can we add a promotion flag? Can we create promotion features?
2. Check when prices rise or dip in anticipation of an event. Try price*holiday_flag.
3. Same as 2 but with promotion flag. Price*promotion_flag
4. Store, depts, cats rolling lags and lags.